/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include "internal.h"

static int mcopy_atomic_pte(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr,
			    struct page **pagep)
{
	struct mem_cgroup *memcg;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	void *page_kaddr;
	int ret;
	struct page *page;

	if (!*pagep) {
		ret = -ENOMEM;
		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
		if (!page)
			goto out;

		page_kaddr = kmap_atomic(page);
		ret = copy_from_user(page_kaddr,
				     (const void __user *) src_addr,
				     PAGE_SIZE);
		kunmap_atomic(page_kaddr);

		/* fallback to copy_from_user outside mmap_sem */
		if (unlikely(ret)) {
			ret = -EFAULT;
			*pagep = page;
			/* don't free the page */
			goto out;
		}
	} else {
		page = *pagep;
		*pagep = NULL;
	}

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	ret = -ENOMEM;
	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
		goto out_release;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));

	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_release_uncharge_unlock;

	inc_mm_counter(dst_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, dst_vma, dst_addr);
	mem_cgroup_commit_charge(page, memcg, false);
	lru_cache_add_active_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);

	pte_unmap_unlock(dst_pte, ptl);
	ret = 0;
out:
	return ret;
out_release_uncharge_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	mem_cgroup_cancel_charge(page, memcg);
out_release:
	page_cache_release(page);
	goto out;
}

static int mfill_zeropage_pte(struct mm_struct *dst_mm,
			      pmd_t *dst_pmd,
			      struct vm_area_struct *dst_vma,
			      unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_unlock;
	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	return ret;
}

static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (pud)
		/*
		 * Note that we didn't run this because the pmd was
		 * missing, the *pmd may be already established and in
		 * turn it may also be a trans_huge_pmd.
		 */
		pmd = pmd_alloc(mm, pud, address);
	return pmd;
}
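
/*
 * __mcopy_atomic() is the common engine behind mcopy_atomic() and
 * mfill_zeropage(): it validates the destination range, requires a
 * non-shared anonymous vma registered with userfaultfd, and then
 * fills the range one pte at a time, either with freshly copied
 * pages or with the zeropage.  It returns the number of bytes
 * filled, or a negative error if nothing could be filled at all.
 */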
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
retry:
	down_read(&dst_mm->mmap_sem);

	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	err = -EINVAL;
	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;
	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	/*
	 * Be strict and only allow __mcopy_atomic on userfaultfd
	 * registered ranges to prevent userland errors going
	 * unnoticed. As far as the VM consistency is concerned, it
	 * would be perfectly safe to remove this check, but there's
	 * no useful usage for __mcopy_atomic outside of userfaultfd
	 * registered ranges. This is after all why these are ioctls
	 * belonging to the userfaultfd and not syscalls.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out_unlock;

	/*
	 * FIXME: only allow copying on anonymous vmas, tmpfs should
	 * be added.
	 */
	if (dst_vma->vm_ops)
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved in the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;
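
	/*
	 * Fill the range one page at a time.  mcopy_atomic_pte() copies
	 * from userland under kmap_atomic(), so it cannot sleep on a
	 * fault in the source: if the copy fails it hands the
	 * preallocated page back and returns -EFAULT, at which point
	 * mmap_sem is dropped, the copy is completed with a sleeping
	 * kmap()/copy_from_user(), and the current address is retried
	 * with the now-filled page.
	 */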
	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
					 dst_addr))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		if (!zeropage)
			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
					       dst_addr, src_addr, &page);
		else
			err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
						 dst_addr);

		cond_resched();

		if (unlikely(err == -EFAULT)) {
			void *page_kaddr;

			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			page_kaddr = kmap(page);
			err = copy_from_user(page_kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap(page);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page)
		page_cache_release(page);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}

ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
		     unsigned long src_start, unsigned long len)
{
	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
}

ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len)
{
	return __mcopy_atomic(dst_mm, start, 0, len, true);
}
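
/*
 * mcopy_atomic() and mfill_zeropage() back the userfaultfd
 * UFFDIO_COPY and UFFDIO_ZEROPAGE ioctls.  A minimal, illustrative
 * sketch of the userland side (uffd is the descriptor returned by the
 * userfaultfd syscall, dst_addr is a page-aligned address inside a
 * range registered with UFFDIO_REGISTER, src_buf is a page-sized
 * buffer; error handling omitted):
 *
 *	struct uffdio_copy copy = {
 *		.dst = dst_addr,
 *		.src = (unsigned long) src_buf,
 *		.len = PAGE_SIZE,
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_COPY, &copy);
 *
 * On return copy.copy holds the number of bytes filled in, or a
 * negated errno such as -EEXIST when the destination pte was already
 * populated.
 */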