arch/powerpc/mm/pgtable.c

DEFINITIONS

This source file includes the following definitions.
  1. is_exec_fault
  2. pte_looks_normal
  3. maybe_pte_to_page
  4. set_pte_filter_hash
  5. set_pte_filter_hash
  6. set_pte_filter
  7. set_access_flags_filter
  8. set_pte_at
  9. ptep_set_access_flags
  10. huge_ptep_set_access_flags
  11. assert_pte_locked
  12. vmalloc_to_phys
  13. __find_linux_pte

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * This file contains common routines for dealing with freeing of page tables,
 * along with common page table handling code.
 *
 *  Derived from arch/powerpc/mm/tlb_64.c:
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 */

#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/hugetlb.h>

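/*
 * Returns true when the current trap came from an instruction access
 * fault (trap 0x400), i.e. the fault was taken on an instruction fetch.
 */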
static inline int is_exec_fault(void)
{
        return current->thread.regs && TRAP(current->thread.regs) == 0x400;
}

/* We only try to do i/d cache coherency on stuff that looks like
 * reasonably "normal" PTEs. We currently require a PTE to be present,
 * we avoid _PAGE_SPECIAL and cache-inhibited PTEs, and we only do it
 * for userspace PTEs.
 */
static inline int pte_looks_normal(pte_t pte)
{
        if (pte_present(pte) && !pte_special(pte)) {
                if (pte_ci(pte))
                        return 0;
                if (pte_user(pte))
                        return 1;
        }
        return 0;
}

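/*
 * Convert a PTE to its struct page, or return NULL when the pfn has no
 * valid memmap entry or the page is reserved (and thus not a candidate
 * for I$/D$ cache maintenance).
 */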
static struct page *maybe_pte_to_page(pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);
        struct page *page;

        if (unlikely(!pfn_valid(pfn)))
                return NULL;
        page = pfn_to_page(pfn);
        if (PageReserved(page))
                return NULL;
        return page;
}

#ifdef CONFIG_PPC_BOOK3S

/* The server-style MMU handles coherency when hashing if HW exec permission
 * is supported per page (currently 64-bit only). If not, we always
 * flush the cache for valid PTEs in set_pte. Embedded CPUs without HW exec
 * support fall into the same category.
 */

static pte_t set_pte_filter_hash(pte_t pte)
{
        if (radix_enabled())
                return pte;

        pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
        if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
                                       cpu_has_feature(CPU_FTR_NOEXECUTE))) {
                struct page *pg = maybe_pte_to_page(pte);
                if (!pg)
                        return pte;
                if (!test_bit(PG_arch_1, &pg->flags)) {
                        flush_dcache_icache_page(pg);
                        set_bit(PG_arch_1, &pg->flags);
                }
        }
        return pte;
}

#else /* CONFIG_PPC_BOOK3S */

static pte_t set_pte_filter_hash(pte_t pte) { return pte; }

#endif /* CONFIG_PPC_BOOK3S */

/* Embedded type MMU with HW exec support. This is a bit more complicated
 * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC, so
 * instead we "filter out" the exec permission for non-clean pages.
 */
static pte_t set_pte_filter(pte_t pte)
{
        struct page *pg;

        if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
                return set_pte_filter_hash(pte);

        /* No exec permission in the first place, move on */
        if (!pte_exec(pte) || !pte_looks_normal(pte))
                return pte;

        /* If you set _PAGE_EXEC on weird pages you're on your own */
        pg = maybe_pte_to_page(pte);
        if (unlikely(!pg))
                return pte;

        /* If the page is clean, we move on */
        if (test_bit(PG_arch_1, &pg->flags))
                return pte;

        /* If it's an exec fault, we flush the cache and make it clean */
        if (is_exec_fault()) {
                flush_dcache_icache_page(pg);
                set_bit(PG_arch_1, &pg->flags);
                return pte;
        }

        /* Else, we filter out _PAGE_EXEC */
        return pte_exprotect(pte);
}

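/*
 * Called when updating access flags on an existing PTE: if this is an
 * exec fault on a page whose _PAGE_EXEC was filtered out above, clean
 * the page (flush D$ to I$) if needed and give execute permission back.
 */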
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
                                     int dirty)
{
        struct page *pg;

        if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
                return pte;

        /* So here, we only care about exec faults, as we use them
         * to recover lost _PAGE_EXEC and perform I$/D$ coherency
         * if necessary. Also if _PAGE_EXEC is already set, same deal,
         * we just bail out.
         */
        if (dirty || pte_exec(pte) || !is_exec_fault())
                return pte;

#ifdef CONFIG_DEBUG_VM
        /* So this is an exec fault, _PAGE_EXEC is not set. If it was
         * an error we would have bailed out earlier in do_page_fault(),
         * but let's make sure of it.
         */
        if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
                return pte;
#endif /* CONFIG_DEBUG_VM */

        /* If you set _PAGE_EXEC on weird pages you're on your own */
        pg = maybe_pte_to_page(pte);
        if (unlikely(!pg))
                goto bail;

        /* If the page is already clean, we move on */
        if (test_bit(PG_arch_1, &pg->flags))
                goto bail;

        /* Clean the page and set PG_arch_1 */
        flush_dcache_icache_page(pg);
        set_bit(PG_arch_1, &pg->flags);

 bail:
        return pte_mkexec(pte);
}

/*
 * set_pte_at() stores a Linux PTE into the Linux page table.
 */
void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
                pte_t pte)
{
        /*
         * Make sure the hardware valid bit is not set. We don't do
         * a TLB flush for this update.
         */
        VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));

        /* Add the pte bit when trying to set a pte */
        pte = pte_mkpte(pte);

        /* Note: mm->context.id might not yet have been assigned, as
         * this context might not have been activated yet when this
         * is called.
         */
        pte = set_pte_filter(pte);

        /* Perform the setting of the PTE */
        __set_pte_at(mm, addr, ptep, pte, 0);
}

/*
 * This is called when relaxing access to a PTE. It's also called in the page
 * fault path when we don't hit any of the major fault cases, i.e. a minor
 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will have
 * handled those for us; we additionally deal with missing execute
 * permission here on some processors.
 */
int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pte_t *ptep, pte_t entry, int dirty)
{
        int changed;

        entry = set_access_flags_filter(entry, vma, dirty);
        changed = !pte_same(*(ptep), entry);
        if (changed) {
                assert_pte_locked(vma->vm_mm, address);
                __ptep_set_access_flags(vma, ptep, entry,
                                        address, mmu_virtual_psize);
        }
        return changed;
}

#ifdef CONFIG_HUGETLB_PAGE
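/*
 * Hugepage variant of ptep_set_access_flags(): update the access flags of
 * a huge PTE, passing down the page size derived from the VMA's hstate
 * where the platform makes use of it.
 */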
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
                               unsigned long addr, pte_t *ptep,
                               pte_t pte, int dirty)
{
#ifdef HUGETLB_NEED_PRELOAD
        /*
         * The "return 1" forces a call of update_mmu_cache, which will write a
         * TLB entry.  Without this, platforms that don't do a write of the TLB
         * entry in the TLB miss handler asm will fault ad infinitum.
         */
        ptep_set_access_flags(vma, addr, ptep, pte, dirty);
        return 1;
#else
        int changed, psize;

        pte = set_access_flags_filter(pte, vma, dirty);
        changed = !pte_same(*(ptep), pte);
        if (changed) {

#ifdef CONFIG_PPC_BOOK3S_64
                struct hstate *h = hstate_vma(vma);

                psize = hstate_get_psize(h);
#ifdef CONFIG_DEBUG_VM
                assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
#endif

#else
                /*
                 * Not used on non-book3s64 platforms. But 8xx
                 * can possibly use tsize derived from hstate.
                 */
                psize = 0;
#endif
                __ptep_set_access_flags(vma, ptep, pte, addr, psize);
        }
        return changed;
#endif
}
#endif /* CONFIG_HUGETLB_PAGE */

#ifdef CONFIG_DEBUG_VM
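/*
 * Debug helper: walk the page tables for @addr and assert that the page
 * table lock protecting its PTE is currently held.
 */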
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        if (mm == &init_mm)
                return;
        pgd = mm->pgd + pgd_index(addr);
        BUG_ON(pgd_none(*pgd));
        pud = pud_offset(pgd, addr);
        BUG_ON(pud_none(*pud));
        pmd = pmd_offset(pud, addr);
        /*
         * For khugepaged to collapse normal pages into a hugepage, it first
         * sets the pmd to none to force page faults and gup to take mmap_sem.
         * After the pmd is set to none, it does a pte_clear, which ends up in
         * this assertion, so if we find the pmd none, just return.
         */
        if (pmd_none(*pmd))
                return;
        BUG_ON(!pmd_present(*pmd));
        assert_spin_locked(pte_lockptr(mm, pmd));
}
#endif /* CONFIG_DEBUG_VM */

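/*
 * Translate a vmalloc'd virtual address to the corresponding physical
 * address, preserving the offset within the page.
 */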
unsigned long vmalloc_to_phys(void *va)
{
        unsigned long pfn = vmalloc_to_pfn(va);

        BUG_ON(!pfn);
        return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
}
EXPORT_SYMBOL_GPL(vmalloc_to_phys);

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it. This function needs to be called with interrupts disabled. We use
 * this variant when we have MSR[EE] = 0 but paca->irq_soft_mask = IRQS_ENABLED.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
                        bool *is_thp, unsigned *hpage_shift)
{
        pgd_t pgd, *pgdp;
        pud_t pud, *pudp;
        pmd_t pmd, *pmdp;
        pte_t *ret_pte;
        hugepd_t *hpdp = NULL;
        unsigned pdshift = PGDIR_SHIFT;

        if (hpage_shift)
                *hpage_shift = 0;

        if (is_thp)
                *is_thp = false;

        pgdp = pgdir + pgd_index(ea);
        pgd  = READ_ONCE(*pgdp);
        /*
         * Always operate on the local stack value. This makes sure the
         * value doesn't get updated by a parallel THP split/collapse,
         * page fault or a page unmap. The returned pte_t * is still not
         * stable, so the caller should re-check for the above conditions.
         */
        if (pgd_none(pgd))
                return NULL;

        if (pgd_is_leaf(pgd)) {
                ret_pte = (pte_t *)pgdp;
                goto out;
        }

        if (is_hugepd(__hugepd(pgd_val(pgd)))) {
                hpdp = (hugepd_t *)&pgd;
                goto out_huge;
        }

        /*
         * Even if we end up with an unmap, the pgtable will not
         * be freed, because we do an RCU free and interrupts are
         * disabled here.
         */
        pdshift = PUD_SHIFT;
        pudp = pud_offset(&pgd, ea);
        pud  = READ_ONCE(*pudp);

        if (pud_none(pud))
                return NULL;

        if (pud_is_leaf(pud)) {
                ret_pte = (pte_t *)pudp;
                goto out;
        }

        if (is_hugepd(__hugepd(pud_val(pud)))) {
                hpdp = (hugepd_t *)&pud;
                goto out_huge;
        }

        pdshift = PMD_SHIFT;
        pmdp = pmd_offset(&pud, ea);
        pmd  = READ_ONCE(*pmdp);

        /*
         * A hugepage collapse is captured by this condition, see
         * pmdp_collapse_flush.
         */
        if (pmd_none(pmd))
                return NULL;

#ifdef CONFIG_PPC_BOOK3S_64
        /*
         * A hugepage split is captured by this condition, see
         * pmdp_invalidate.
         *
         * Huge page modification can be caught here too.
         */
        if (pmd_is_serializing(pmd))
                return NULL;
#endif

        if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
                if (is_thp)
                        *is_thp = true;
                ret_pte = (pte_t *)pmdp;
                goto out;
        }

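        /*
         * Not a THP, but still a leaf entry at the PMD level, e.g. a
         * hugetlb mapping or a huge kernel mapping.
         */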
        if (pmd_is_leaf(pmd)) {
                ret_pte = (pte_t *)pmdp;
                goto out;
        }

        if (is_hugepd(__hugepd(pmd_val(pmd)))) {
                hpdp = (hugepd_t *)&pmd;
                goto out_huge;
        }

        return pte_offset_kernel(&pmd, ea);

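        /*
         * The entry points into a hugepd directory: look up the PTE within
         * it and report the hugepage shift taken from the hugepd rather
         * than from the level it was found at.
         */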
out_huge:
        if (!hpdp)
                return NULL;

        ret_pte = hugepte_offset(*hpdp, ea, pdshift);
        pdshift = hugepd_shift(*hpdp);
out:
        if (hpage_shift)
                *hpage_shift = pdshift;
        return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
