1/*
2 * linux/ipc/shm.c
3 * Copyright (C) 1992, 1993 Krishna Balasubramanian
4 *	 Many improvements/fixes by Bruno Haible.
5 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
6 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
7 *
8 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
9 * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
10 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
11 * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
12 * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com>
13 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
14 * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
15 *
16 * support for audit of ipc object properties and permission changes
17 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
18 *
19 * namespaces support
20 * OpenVZ, SWsoft Inc.
21 * Pavel Emelianov <xemul@openvz.org>
22 *
23 * Better ipc lock (kern_ipc_perm.lock) handling
24 * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013.
25 */
26
27#include <linux/slab.h>
28#include <linux/mm.h>
29#include <linux/hugetlb.h>
30#include <linux/shm.h>
31#include <linux/init.h>
32#include <linux/file.h>
33#include <linux/mman.h>
34#include <linux/shmem_fs.h>
35#include <linux/security.h>
36#include <linux/syscalls.h>
37#include <linux/audit.h>
38#include <linux/capability.h>
39#include <linux/ptrace.h>
40#include <linux/seq_file.h>
41#include <linux/rwsem.h>
42#include <linux/nsproxy.h>
43#include <linux/mount.h>
44#include <linux/ipc_namespace.h>
45
46#include <linux/uaccess.h>
47
48#include "util.h"
49
/*
 * Private data attached to the per-attach file that do_shmat() allocates
 * (stored in file->private_data and freed by shm_release()).
 */
struct shm_file_data {
	int id;					/* shm_perm.id of the segment; used to look it up again */
	struct ipc_namespace *ns;		/* namespace of the segment; reference dropped in shm_release() */
	struct file *file;			/* the segment's own file (shp->shm_file) that calls are forwarded to */
	const struct vm_operations_struct *vm_ops;	/* vm_ops installed by the backing mmap, saved by shm_mmap() */
};
56
/* View file->private_data as an assignable struct shm_file_data pointer. */
#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))

static const struct file_operations shm_file_operations;
static const struct vm_operations_struct shm_vm_ops;

/* The shared-memory slot of a namespace's ids[] array. */
#define shm_ids(ns)	((ns)->ids[IPC_SHM_IDS])

/* Unlock a segment by calling ipc_unlock() on its embedded shm_perm. */
#define shm_unlock(shp)			\
	ipc_unlock(&(shp)->shm_perm)

/* Forward declarations. */
static int newseg(struct ipc_namespace *, struct ipc_params *);
static void shm_open(struct vm_area_struct *vma);
static void shm_close(struct vm_area_struct *vma);
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
#ifdef CONFIG_PROC_FS
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
#endif
74
75void shm_init_ns(struct ipc_namespace *ns)
76{
77	ns->shm_ctlmax = SHMMAX;
78	ns->shm_ctlall = SHMALL;
79	ns->shm_ctlmni = SHMMNI;
80	ns->shm_rmid_forced = 0;
81	ns->shm_tot = 0;
82	ipc_init_ids(&shm_ids(ns));
83}
84
85/*
86 * Called with shm_ids.rwsem (writer) and the shp structure locked.
87 * Only shm_ids.rwsem remains locked on exit.
88 */
89static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
90{
91	struct shmid_kernel *shp;
92	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
93
94	if (shp->shm_nattch) {
95		shp->shm_perm.mode |= SHM_DEST;
96		/* Do not find it any more */
97		shp->shm_perm.key = IPC_PRIVATE;
98		shm_unlock(shp);
99	} else
100		shm_destroy(ns, shp);
101}
102
#ifdef CONFIG_IPC_NS
/*
 * Hand this namespace's shm ids to free_ipcs() with do_shm_rmid() as the
 * per-object callback, then destroy the backing idr.
 */
void shm_exit_ns(struct ipc_namespace *ns)
{
	free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
	idr_destroy(&shm_ids(ns).ipcs_idr);
}
#endif
110
/* Initialise the shared-memory state of the initial ipc namespace. */
static int __init ipc_ns_init(void)
{
	shm_init_ns(&init_ipc_ns);
	return 0;
}

/* Registered as a pure initcall. */
pure_initcall(ipc_ns_init);
118
/*
 * Register the "sysvipc/shm" proc interface. The column header is wider
 * when BITS_PER_LONG is above 32 (size, rss and swap columns); rows are
 * produced by sysvipc_shm_proc_show().
 */
void __init shm_init(void)
{
	ipc_init_proc_interface("sysvipc/shm",
#if BITS_PER_LONG <= 32
				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        rss       swap\n",
#else
				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   rss                  swap\n",
#endif
				IPC_SHM_IDS, sysvipc_shm_proc_show);
}
129
130static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id)
131{
132	struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&shm_ids(ns), id);
133
134	if (IS_ERR(ipcp))
135		return ERR_CAST(ipcp);
136
137	return container_of(ipcp, struct shmid_kernel, shm_perm);
138}
139
140static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id)
141{
142	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id);
143
144	if (IS_ERR(ipcp))
145		return ERR_CAST(ipcp);
146
147	return container_of(ipcp, struct shmid_kernel, shm_perm);
148}
149
150/*
151 * shm_lock_(check_) routines are called in the paths where the rwsem
152 * is not necessarily held.
153 */
154static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
155{
156	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
157
158	/*
159	 * Callers of shm_lock() must validate the status of the returned ipc
160	 * object pointer (as returned by ipc_lock()), and error out as
161	 * appropriate.
162	 */
163	if (IS_ERR(ipcp))
164		return (void *)ipcp;
165	return container_of(ipcp, struct shmid_kernel, shm_perm);
166}
167
/*
 * Lock a segment the caller already has a pointer to: enter an RCU
 * read-side section, then take the object lock on its shm_perm.
 */
static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
{
	rcu_read_lock();
	ipc_lock_object(&ipcp->shm_perm);
}
173
174static void shm_rcu_free(struct rcu_head *head)
175{
176	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
177	struct shmid_kernel *shp = ipc_rcu_to_struct(p);
178
179	security_shm_free(shp);
180	ipc_rcu_free(head);
181}
182
/*
 * Unlink segment @s from the shm_clist it is on and remove it from the
 * namespace's shm ids via ipc_rmid().
 */
static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
{
	list_del(&s->shm_clist);
	ipc_rmid(&shm_ids(ns), &s->shm_perm);
}
188
189
190static int __shm_open(struct vm_area_struct *vma)
191{
192	struct file *file = vma->vm_file;
193	struct shm_file_data *sfd = shm_file_data(file);
194	struct shmid_kernel *shp;
195
196	shp = shm_lock(sfd->ns, sfd->id);
197
198	if (IS_ERR(shp))
199		return PTR_ERR(shp);
200
201	shp->shm_atim = get_seconds();
202	shp->shm_lprid = task_tgid_vnr(current);
203	shp->shm_nattch++;
204	shm_unlock(shp);
205	return 0;
206}
207
/* This is called by fork, once for every shm attach. */
static void shm_open(struct vm_area_struct *vma)
{
	/*
	 * A failure means we raced in the idr lookup or with shm_destroy();
	 * either way the ID is busted, so just warn once.
	 */
	WARN_ON_ONCE(__shm_open(vma));
}
218
219/*
220 * shm_destroy - free the struct shmid_kernel
221 *
222 * @ns: namespace
223 * @shp: struct to free
224 *
225 * It has to be called with shp and shm_ids.rwsem (writer) locked,
226 * but returns with shp unlocked and freed.
227 */
228static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
229{
230	struct file *shm_file;
231
232	shm_file = shp->shm_file;
233	shp->shm_file = NULL;
234	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
235	shm_rmid(ns, shp);
236	shm_unlock(shp);
237	if (!is_file_hugepages(shm_file))
238		shmem_lock(shm_file, 0, shp->mlock_user);
239	else if (shp->mlock_user)
240		user_shm_unlock(i_size_read(file_inode(shm_file)),
241				shp->mlock_user);
242	fput(shm_file);
243	ipc_rcu_putref(shp, shm_rcu_free);
244}
245
246/*
247 * shm_may_destroy - identifies whether shm segment should be destroyed now
248 *
249 * Returns true if and only if there are no active users of the segment and
250 * one of the following is true:
251 *
252 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
253 *
254 * 2) sysctl kernel.shm_rmid_forced is set to 1.
255 */
256static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
257{
258	return (shp->shm_nattch == 0) &&
259	       (ns->shm_rmid_forced ||
260		(shp->shm_perm.mode & SHM_DEST));
261}
262
263/*
264 * remove the attach descriptor vma.
265 * free memory for segment if it is marked destroyed.
266 * The descriptor has already been removed from the current->mm->mmap list
267 * and will later be kfree()d.
268 */
269static void shm_close(struct vm_area_struct *vma)
270{
271	struct file *file = vma->vm_file;
272	struct shm_file_data *sfd = shm_file_data(file);
273	struct shmid_kernel *shp;
274	struct ipc_namespace *ns = sfd->ns;
275
276	down_write(&shm_ids(ns).rwsem);
277	/* remove from the list of attaches of the shm segment */
278	shp = shm_lock(ns, sfd->id);
279
280	/*
281	 * We raced in the idr lookup or with shm_destroy().
282	 * Either way, the ID is busted.
283	 */
284	if (WARN_ON_ONCE(IS_ERR(shp)))
285		goto done; /* no-op */
286
287	shp->shm_lprid = task_tgid_vnr(current);
288	shp->shm_dtim = get_seconds();
289	shp->shm_nattch--;
290	if (shm_may_destroy(ns, shp))
291		shm_destroy(ns, shp);
292	else
293		shm_unlock(shp);
294done:
295	up_write(&shm_ids(ns).rwsem);
296}
297
298/* Called with ns->shm_ids(ns).rwsem locked */
299static int shm_try_destroy_orphaned(int id, void *p, void *data)
300{
301	struct ipc_namespace *ns = data;
302	struct kern_ipc_perm *ipcp = p;
303	struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
304
305	/*
306	 * We want to destroy segments without users and with already
307	 * exit'ed originating process.
308	 *
309	 * As shp->* are changed under rwsem, it's safe to skip shp locking.
310	 */
311	if (shp->shm_creator != NULL)
312		return 0;
313
314	if (shm_may_destroy(ns, shp)) {
315		shm_lock_by_ptr(shp);
316		shm_destroy(ns, shp);
317	}
318	return 0;
319}
320
321void shm_destroy_orphaned(struct ipc_namespace *ns)
322{
323	down_write(&shm_ids(ns).rwsem);
324	if (shm_ids(ns).in_use)
325		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
326	up_write(&shm_ids(ns).rwsem);
327}
328
329/* Locking assumes this will only be called with task == current */
330void exit_shm(struct task_struct *task)
331{
332	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
333	struct shmid_kernel *shp, *n;
334
335	if (list_empty(&task->sysvshm.shm_clist))
336		return;
337
338	/*
339	 * If kernel.shm_rmid_forced is not set then only keep track of
340	 * which shmids are orphaned, so that a later set of the sysctl
341	 * can clean them up.
342	 */
343	if (!ns->shm_rmid_forced) {
344		down_read(&shm_ids(ns).rwsem);
345		list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
346			shp->shm_creator = NULL;
347		/*
348		 * Only under read lock but we are only called on current
349		 * so no entry on the list will be shared.
350		 */
351		list_del(&task->sysvshm.shm_clist);
352		up_read(&shm_ids(ns).rwsem);
353		return;
354	}
355
356	/*
357	 * Destroy all already created segments, that were not yet mapped,
358	 * and mark any mapped as orphan to cover the sysctl toggling.
359	 * Destroy is skipped if shm_may_destroy() returns false.
360	 */
361	down_write(&shm_ids(ns).rwsem);
362	list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
363		shp->shm_creator = NULL;
364
365		if (shm_may_destroy(ns, shp)) {
366			shm_lock_by_ptr(shp);
367			shm_destroy(ns, shp);
368		}
369	}
370
371	/* Remove the list head from any segments still attached. */
372	list_del(&task->sysvshm.shm_clist);
373	up_write(&shm_ids(ns).rwsem);
374}
375
376static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
377{
378	struct file *file = vma->vm_file;
379	struct shm_file_data *sfd = shm_file_data(file);
380
381	return sfd->vm_ops->fault(vma, vmf);
382}
383
384#ifdef CONFIG_NUMA
385static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
386{
387	struct file *file = vma->vm_file;
388	struct shm_file_data *sfd = shm_file_data(file);
389	int err = 0;
390	if (sfd->vm_ops->set_policy)
391		err = sfd->vm_ops->set_policy(vma, new);
392	return err;
393}
394
395static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
396					unsigned long addr)
397{
398	struct file *file = vma->vm_file;
399	struct shm_file_data *sfd = shm_file_data(file);
400	struct mempolicy *pol = NULL;
401
402	if (sfd->vm_ops->get_policy)
403		pol = sfd->vm_ops->get_policy(vma, addr);
404	else if (vma->vm_policy)
405		pol = vma->vm_policy;
406
407	return pol;
408}
409#endif
410
411static int shm_mmap(struct file *file, struct vm_area_struct *vma)
412{
413	struct shm_file_data *sfd = shm_file_data(file);
414	int ret;
415
416	/*
417	 * In case of remap_file_pages() emulation, the file can represent
418	 * removed IPC ID: propogate shm_lock() error to caller.
419	 */
420	ret =__shm_open(vma);
421	if (ret)
422		return ret;
423
424	ret = sfd->file->f_op->mmap(sfd->file, vma);
425	if (ret) {
426		shm_close(vma);
427		return ret;
428	}
429	sfd->vm_ops = vma->vm_ops;
430#ifdef CONFIG_MMU
431	WARN_ON(!sfd->vm_ops->fault);
432#endif
433	vma->vm_ops = &shm_vm_ops;
434	return 0;
435}
436
437static int shm_release(struct inode *ino, struct file *file)
438{
439	struct shm_file_data *sfd = shm_file_data(file);
440
441	put_ipc_ns(sfd->ns);
442	shm_file_data(file) = NULL;
443	kfree(sfd);
444	return 0;
445}
446
447static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
448{
449	struct shm_file_data *sfd = shm_file_data(file);
450
451	if (!sfd->file->f_op->fsync)
452		return -EINVAL;
453	return sfd->file->f_op->fsync(sfd->file, start, end, datasync);
454}
455
456static long shm_fallocate(struct file *file, int mode, loff_t offset,
457			  loff_t len)
458{
459	struct shm_file_data *sfd = shm_file_data(file);
460
461	if (!sfd->file->f_op->fallocate)
462		return -EOPNOTSUPP;
463	return sfd->file->f_op->fallocate(file, mode, offset, len);
464}
465
466static unsigned long shm_get_unmapped_area(struct file *file,
467	unsigned long addr, unsigned long len, unsigned long pgoff,
468	unsigned long flags)
469{
470	struct shm_file_data *sfd = shm_file_data(file);
471	return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
472						pgoff, flags);
473}
474
/*
 * File operations of the per-attach file for non-hugepage segments (see
 * the selection in do_shmat()). get_unmapped_area is only wired up on
 * !CONFIG_MMU builds.
 */
static const struct file_operations shm_file_operations = {
	.mmap		= shm_mmap,
	.fsync		= shm_fsync,
	.release	= shm_release,
#ifndef CONFIG_MMU
	.get_unmapped_area	= shm_get_unmapped_area,
#endif
	.llseek		= noop_llseek,
	.fallocate	= shm_fallocate,
};
485
/*
 * File operations of the per-attach file for hugepage-backed segments.
 * Same handlers as shm_file_operations, but get_unmapped_area is always
 * forwarded to the backing file. is_file_shm_hugepages() identifies such
 * files by comparing f_op against this table.
 */
static const struct file_operations shm_file_operations_huge = {
	.mmap		= shm_mmap,
	.fsync		= shm_fsync,
	.release	= shm_release,
	.get_unmapped_area	= shm_get_unmapped_area,
	.llseek		= noop_llseek,
	.fallocate	= shm_fallocate,
};
494
495int is_file_shm_hugepages(struct file *file)
496{
497	return file->f_op == &shm_file_operations_huge;
498}
499
/*
 * vm_ops installed on every shm mapping by shm_mmap(). open/close keep the
 * attach count of the segment; the rest forward to the vm_ops saved from
 * the backing mapping.
 */
static const struct vm_operations_struct shm_vm_ops = {
	.open	= shm_open,	/* callback for a new vm-area open */
	.close	= shm_close,	/* callback for when the vm-area is released */
	.fault	= shm_fault,
#if defined(CONFIG_NUMA)
	.set_policy = shm_set_policy,
	.get_policy = shm_get_policy,
#endif
};
509
510/**
511 * newseg - Create a new shared memory segment
512 * @ns: namespace
513 * @params: ptr to the structure that contains key, size and shmflg
514 *
515 * Called with shm_ids.rwsem held as a writer.
516 */
517static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
518{
519	key_t key = params->key;
520	int shmflg = params->flg;
521	size_t size = params->u.size;
522	int error;
523	struct shmid_kernel *shp;
524	size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
525	struct file *file;
526	char name[13];
527	int id;
528	vm_flags_t acctflag = 0;
529
530	if (size < SHMMIN || size > ns->shm_ctlmax)
531		return -EINVAL;
532
533	if (numpages << PAGE_SHIFT < size)
534		return -ENOSPC;
535
536	if (ns->shm_tot + numpages < ns->shm_tot ||
537			ns->shm_tot + numpages > ns->shm_ctlall)
538		return -ENOSPC;
539
540	shp = ipc_rcu_alloc(sizeof(*shp));
541	if (!shp)
542		return -ENOMEM;
543
544	shp->shm_perm.key = key;
545	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
546	shp->mlock_user = NULL;
547
548	shp->shm_perm.security = NULL;
549	error = security_shm_alloc(shp);
550	if (error) {
551		ipc_rcu_putref(shp, ipc_rcu_free);
552		return error;
553	}
554
555	sprintf(name, "SYSV%08x", key);
556	if (shmflg & SHM_HUGETLB) {
557		struct hstate *hs;
558		size_t hugesize;
559
560		hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
561		if (!hs) {
562			error = -EINVAL;
563			goto no_file;
564		}
565		hugesize = ALIGN(size, huge_page_size(hs));
566
567		/* hugetlb_file_setup applies strict accounting */
568		if (shmflg & SHM_NORESERVE)
569			acctflag = VM_NORESERVE;
570		file = hugetlb_file_setup(name, hugesize, acctflag,
571				  &shp->mlock_user, HUGETLB_SHMFS_INODE,
572				(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
573	} else {
574		/*
575		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
576		 * if it's asked for.
577		 */
578		if  ((shmflg & SHM_NORESERVE) &&
579				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
580			acctflag = VM_NORESERVE;
581		file = shmem_kernel_file_setup(name, size, acctflag);
582	}
583	error = PTR_ERR(file);
584	if (IS_ERR(file))
585		goto no_file;
586
587	shp->shm_cprid = task_tgid_vnr(current);
588	shp->shm_lprid = 0;
589	shp->shm_atim = shp->shm_dtim = 0;
590	shp->shm_ctim = get_seconds();
591	shp->shm_segsz = size;
592	shp->shm_nattch = 0;
593	shp->shm_file = file;
594	shp->shm_creator = current;
595
596	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
597	if (id < 0) {
598		error = id;
599		goto no_id;
600	}
601
602	list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
603
604	/*
605	 * shmid gets reported as "inode#" in /proc/pid/maps.
606	 * proc-ps tools use this. Changing this will break them.
607	 */
608	file_inode(file)->i_ino = shp->shm_perm.id;
609
610	ns->shm_tot += numpages;
611	error = shp->shm_perm.id;
612
613	ipc_unlock_object(&shp->shm_perm);
614	rcu_read_unlock();
615	return error;
616
617no_id:
618	if (is_file_hugepages(file) && shp->mlock_user)
619		user_shm_unlock(size, shp->mlock_user);
620	fput(file);
621no_file:
622	ipc_rcu_putref(shp, shm_rcu_free);
623	return error;
624}
625
626/*
627 * Called with shm_ids.rwsem and ipcp locked.
628 */
629static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
630{
631	struct shmid_kernel *shp;
632
633	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
634	return security_shm_associate(shp, shmflg);
635}
636
637/*
638 * Called with shm_ids.rwsem and ipcp locked.
639 */
640static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
641				struct ipc_params *params)
642{
643	struct shmid_kernel *shp;
644
645	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
646	if (shp->shm_segsz < params->u.size)
647		return -EINVAL;
648
649	return 0;
650}
651
652SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
653{
654	struct ipc_namespace *ns;
655	static const struct ipc_ops shm_ops = {
656		.getnew = newseg,
657		.associate = shm_security,
658		.more_checks = shm_more_checks,
659	};
660	struct ipc_params shm_params;
661
662	ns = current->nsproxy->ipc_ns;
663
664	shm_params.key = key;
665	shm_params.flg = shmflg;
666	shm_params.u.size = size;
667
668	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
669}
670
671static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
672{
673	switch (version) {
674	case IPC_64:
675		return copy_to_user(buf, in, sizeof(*in));
676	case IPC_OLD:
677	    {
678		struct shmid_ds out;
679
680		memset(&out, 0, sizeof(out));
681		ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
682		out.shm_segsz	= in->shm_segsz;
683		out.shm_atime	= in->shm_atime;
684		out.shm_dtime	= in->shm_dtime;
685		out.shm_ctime	= in->shm_ctime;
686		out.shm_cpid	= in->shm_cpid;
687		out.shm_lpid	= in->shm_lpid;
688		out.shm_nattch	= in->shm_nattch;
689
690		return copy_to_user(buf, &out, sizeof(out));
691	    }
692	default:
693		return -EINVAL;
694	}
695}
696
697static inline unsigned long
698copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
699{
700	switch (version) {
701	case IPC_64:
702		if (copy_from_user(out, buf, sizeof(*out)))
703			return -EFAULT;
704		return 0;
705	case IPC_OLD:
706	    {
707		struct shmid_ds tbuf_old;
708
709		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
710			return -EFAULT;
711
712		out->shm_perm.uid	= tbuf_old.shm_perm.uid;
713		out->shm_perm.gid	= tbuf_old.shm_perm.gid;
714		out->shm_perm.mode	= tbuf_old.shm_perm.mode;
715
716		return 0;
717	    }
718	default:
719		return -EINVAL;
720	}
721}
722
723static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
724{
725	switch (version) {
726	case IPC_64:
727		return copy_to_user(buf, in, sizeof(*in));
728	case IPC_OLD:
729	    {
730		struct shminfo out;
731
732		if (in->shmmax > INT_MAX)
733			out.shmmax = INT_MAX;
734		else
735			out.shmmax = (int)in->shmmax;
736
737		out.shmmin	= in->shmmin;
738		out.shmmni	= in->shmmni;
739		out.shmseg	= in->shmseg;
740		out.shmall	= in->shmall;
741
742		return copy_to_user(buf, &out, sizeof(out));
743	    }
744	default:
745		return -EINVAL;
746	}
747}
748
749/*
750 * Calculate and add used RSS and swap pages of a shm.
751 * Called with shm_ids.rwsem held as a reader
752 */
753static void shm_add_rss_swap(struct shmid_kernel *shp,
754	unsigned long *rss_add, unsigned long *swp_add)
755{
756	struct inode *inode;
757
758	inode = file_inode(shp->shm_file);
759
760	if (is_file_hugepages(shp->shm_file)) {
761		struct address_space *mapping = inode->i_mapping;
762		struct hstate *h = hstate_file(shp->shm_file);
763		*rss_add += pages_per_huge_page(h) * mapping->nrpages;
764	} else {
765#ifdef CONFIG_SHMEM
766		struct shmem_inode_info *info = SHMEM_I(inode);
767		spin_lock(&info->lock);
768		*rss_add += inode->i_mapping->nrpages;
769		*swp_add += info->swapped;
770		spin_unlock(&info->lock);
771#else
772		*rss_add += inode->i_mapping->nrpages;
773#endif
774	}
775}
776
777/*
778 * Called with shm_ids.rwsem held as a reader
779 */
780static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
781		unsigned long *swp)
782{
783	int next_id;
784	int total, in_use;
785
786	*rss = 0;
787	*swp = 0;
788
789	in_use = shm_ids(ns).in_use;
790
791	for (total = 0, next_id = 0; total < in_use; next_id++) {
792		struct kern_ipc_perm *ipc;
793		struct shmid_kernel *shp;
794
795		ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
796		if (ipc == NULL)
797			continue;
798		shp = container_of(ipc, struct shmid_kernel, shm_perm);
799
800		shm_add_rss_swap(shp, rss, swp);
801
802		total++;
803	}
804}
805
/*
 * This function handles some shmctl commands which require the rwsem
 * to be held in write mode (IPC_RMID and IPC_SET; anything else yields
 * -EINVAL).
 * NOTE: no locks must be held, the rwsem is taken inside this function.
 *
 * Returns 0 on success or a negative errno.
 */
static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
		       struct shmid_ds __user *buf, int version)
{
	struct kern_ipc_perm *ipcp;
	struct shmid64_ds shmid64;
	struct shmid_kernel *shp;
	int err;

	/* fetch the new settings before any lock is taken */
	if (cmd == IPC_SET) {
		if (copy_shmid_from_user(&shmid64, buf, version))
			return -EFAULT;
	}

	down_write(&shm_ids(ns).rwsem);
	rcu_read_lock();

	/* look the object up; the object lock is not held yet */
	ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
				      &shmid64.shm_perm, 0);
	if (IS_ERR(ipcp)) {
		err = PTR_ERR(ipcp);
		goto out_unlock1;
	}

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);

	err = security_shm_shmctl(shp, cmd);
	if (err)
		goto out_unlock1;

	/* err is 0 from here on unless a case below changes it */
	switch (cmd) {
	case IPC_RMID:
		ipc_lock_object(&shp->shm_perm);
		/* do_shm_rmid unlocks the ipc object and rcu */
		do_shm_rmid(ns, ipcp);
		goto out_up;
	case IPC_SET:
		ipc_lock_object(&shp->shm_perm);
		err = ipc_update_perm(&shmid64.shm_perm, ipcp);
		if (err)
			goto out_unlock0;
		shp->shm_ctim = get_seconds();
		break;
	default:
		err = -EINVAL;
		goto out_unlock1;
	}

	/* unwind in reverse order of acquisition */
out_unlock0:
	ipc_unlock_object(&shp->shm_perm);
out_unlock1:
	rcu_read_unlock();
out_up:
	up_write(&shm_ids(ns).rwsem);
	return err;
}
866
/*
 * Handle the shmctl commands that only report information: IPC_INFO,
 * SHM_INFO, SHM_STAT and IPC_STAT. The object lock is never taken here;
 * the *_INFO commands hold shm_ids.rwsem as a reader while sampling, the
 * *_STAT commands read the segment inside an RCU read-side section.
 *
 * Returns a non-negative value on success (see the individual cases),
 * a negative errno otherwise, and -EINVAL for any other command.
 */
static int shmctl_nolock(struct ipc_namespace *ns, int shmid,
			 int cmd, int version, void __user *buf)
{
	int err;
	struct shmid_kernel *shp;

	/* preliminary security checks for *_INFO */
	if (cmd == IPC_INFO || cmd == SHM_INFO) {
		err = security_shm_shmctl(NULL, cmd);
		if (err)
			return err;
	}

	switch (cmd) {
	case IPC_INFO:
	{
		/* report the namespace limits */
		struct shminfo64 shminfo;

		memset(&shminfo, 0, sizeof(shminfo));
		shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
		shminfo.shmmax = ns->shm_ctlmax;
		shminfo.shmall = ns->shm_ctlall;

		shminfo.shmmin = SHMMIN;
		if (copy_shminfo_to_user(buf, &shminfo, version))
			return -EFAULT;

		down_read(&shm_ids(ns).rwsem);
		err = ipc_get_maxid(&shm_ids(ns));
		up_read(&shm_ids(ns).rwsem);

		/* the return value is the max id, never below 0 */
		if (err < 0)
			err = 0;
		goto out;
	}
	case SHM_INFO:
	{
		/* report current usage of the namespace */
		struct shm_info shm_info;

		memset(&shm_info, 0, sizeof(shm_info));
		down_read(&shm_ids(ns).rwsem);
		shm_info.used_ids = shm_ids(ns).in_use;
		shm_get_stat(ns, &shm_info.shm_rss, &shm_info.shm_swp);
		shm_info.shm_tot = ns->shm_tot;
		shm_info.swap_attempts = 0;
		shm_info.swap_successes = 0;
		err = ipc_get_maxid(&shm_ids(ns));
		up_read(&shm_ids(ns).rwsem);
		/* copy out only after the rwsem has been dropped */
		if (copy_to_user(buf, &shm_info, sizeof(shm_info))) {
			err = -EFAULT;
			goto out;
		}

		/* the return value is the max id, never below 0 */
		err = err < 0 ? 0 : err;
		goto out;
	}
	case SHM_STAT:
	case IPC_STAT:
	{
		struct shmid64_ds tbuf;
		int result;

		rcu_read_lock();
		if (cmd == SHM_STAT) {
			/* lookup via shm_obtain_object(); success returns shm_perm.id */
			shp = shm_obtain_object(ns, shmid);
			if (IS_ERR(shp)) {
				err = PTR_ERR(shp);
				goto out_unlock;
			}
			result = shp->shm_perm.id;
		} else {
			/* lookup via shm_obtain_object_check(); success returns 0 */
			shp = shm_obtain_object_check(ns, shmid);
			if (IS_ERR(shp)) {
				err = PTR_ERR(shp);
				goto out_unlock;
			}
			result = 0;
		}

		/* the caller needs read permission on the segment */
		err = -EACCES;
		if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
			goto out_unlock;

		err = security_shm_shmctl(shp, cmd);
		if (err)
			goto out_unlock;

		/* snapshot into a zeroed buffer, then leave the RCU section */
		memset(&tbuf, 0, sizeof(tbuf));
		kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
		tbuf.shm_segsz	= shp->shm_segsz;
		tbuf.shm_atime	= shp->shm_atim;
		tbuf.shm_dtime	= shp->shm_dtim;
		tbuf.shm_ctime	= shp->shm_ctim;
		tbuf.shm_cpid	= shp->shm_cprid;
		tbuf.shm_lpid	= shp->shm_lprid;
		tbuf.shm_nattch	= shp->shm_nattch;
		rcu_read_unlock();

		if (copy_shmid_to_user(buf, &tbuf, version))
			err = -EFAULT;
		else
			err = result;
		goto out;
	}
	default:
		return -EINVAL;
	}

	/* only the *_STAT error paths come here, still inside the RCU section */
out_unlock:
	rcu_read_unlock();
out:
	return err;
}
980
/*
 * shmctl(2): dispatch on @cmd. The reporting commands go to
 * shmctl_nolock(), IPC_RMID/IPC_SET to shmctl_down(); SHM_LOCK and
 * SHM_UNLOCK are handled here under the object lock. Negative @cmd or
 * @shmid and unknown commands yield -EINVAL.
 */
SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
{
	struct shmid_kernel *shp;
	int err, version;
	struct ipc_namespace *ns;

	if (cmd < 0 || shmid < 0)
		return -EINVAL;

	/* ipc_parse_version() may modify cmd through the pointer */
	version = ipc_parse_version(&cmd);
	ns = current->nsproxy->ipc_ns;

	switch (cmd) {
	case IPC_INFO:
	case SHM_INFO:
	case SHM_STAT:
	case IPC_STAT:
		return shmctl_nolock(ns, shmid, cmd, version, buf);
	case IPC_RMID:
	case IPC_SET:
		return shmctl_down(ns, shmid, cmd, buf, version);
	case SHM_LOCK:
	case SHM_UNLOCK:
	{
		struct file *shm_file;

		rcu_read_lock();
		shp = shm_obtain_object_check(ns, shmid);
		if (IS_ERR(shp)) {
			err = PTR_ERR(shp);
			goto out_unlock1;
		}

		audit_ipc_obj(&(shp->shm_perm));
		err = security_shm_shmctl(shp, cmd);
		if (err)
			goto out_unlock1;

		/* err is 0 from here on unless set again below */
		ipc_lock_object(&shp->shm_perm);

		/* check if shm_destroy() is tearing down shp */
		if (!ipc_valid_object(&shp->shm_perm)) {
			err = -EIDRM;
			goto out_unlock0;
		}

		/*
		 * Without CAP_IPC_LOCK the caller must be the owner or the
		 * creator of the segment, and SHM_LOCK additionally needs a
		 * non-zero RLIMIT_MEMLOCK.
		 */
		if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
			kuid_t euid = current_euid();
			if (!uid_eq(euid, shp->shm_perm.uid) &&
			    !uid_eq(euid, shp->shm_perm.cuid)) {
				err = -EPERM;
				goto out_unlock0;
			}
			if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) {
				err = -EPERM;
				goto out_unlock0;
			}
		}

		/* hugepage segments: nothing to do, return 0 */
		shm_file = shp->shm_file;
		if (is_file_hugepages(shm_file))
			goto out_unlock0;

		if (cmd == SHM_LOCK) {
			struct user_struct *user = current_user();
			err = shmem_lock(shm_file, 1, user);
			/* record the locking user only on the first lock */
			if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
				shp->shm_perm.mode |= SHM_LOCKED;
				shp->mlock_user = user;
			}
			goto out_unlock0;
		}

		/* SHM_UNLOCK */
		if (!(shp->shm_perm.mode & SHM_LOCKED))
			goto out_unlock0;
		shmem_lock(shm_file, 0, shp->mlock_user);
		shp->shm_perm.mode &= ~SHM_LOCKED;
		shp->mlock_user = NULL;
		/*
		 * Pin the file so that its mapping can still be used after
		 * the object lock and the RCU section have been dropped.
		 */
		get_file(shm_file);
		ipc_unlock_object(&shp->shm_perm);
		rcu_read_unlock();
		shmem_unlock_mapping(shm_file->f_mapping);

		fput(shm_file);
		return err;
	}
	default:
		return -EINVAL;
	}

out_unlock0:
	ipc_unlock_object(&shp->shm_perm);
out_unlock1:
	rcu_read_unlock();
	return err;
}
1078
1079/*
1080 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
1081 *
1082 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
1083 * "raddr" thing points to kernel space, and there has to be a wrapper around
1084 * this.
1085 */
1086long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
1087	      unsigned long shmlba)
1088{
1089	struct shmid_kernel *shp;
1090	unsigned long addr;
1091	unsigned long size;
1092	struct file *file;
1093	int    err;
1094	unsigned long flags;
1095	unsigned long prot;
1096	int acc_mode;
1097	struct ipc_namespace *ns;
1098	struct shm_file_data *sfd;
1099	struct path path;
1100	fmode_t f_mode;
1101	unsigned long populate = 0;
1102
1103	err = -EINVAL;
1104	if (shmid < 0)
1105		goto out;
1106	else if ((addr = (ulong)shmaddr)) {
1107		if (addr & (shmlba - 1)) {
1108			if (shmflg & SHM_RND)
1109				addr &= ~(shmlba - 1);	   /* round down */
1110			else
1111#ifndef __ARCH_FORCE_SHMLBA
1112				if (addr & ~PAGE_MASK)
1113#endif
1114					goto out;
1115		}
1116		flags = MAP_SHARED | MAP_FIXED;
1117	} else {
1118		if ((shmflg & SHM_REMAP))
1119			goto out;
1120
1121		flags = MAP_SHARED;
1122	}
1123
1124	if (shmflg & SHM_RDONLY) {
1125		prot = PROT_READ;
1126		acc_mode = S_IRUGO;
1127		f_mode = FMODE_READ;
1128	} else {
1129		prot = PROT_READ | PROT_WRITE;
1130		acc_mode = S_IRUGO | S_IWUGO;
1131		f_mode = FMODE_READ | FMODE_WRITE;
1132	}
1133	if (shmflg & SHM_EXEC) {
1134		prot |= PROT_EXEC;
1135		acc_mode |= S_IXUGO;
1136	}
1137
1138	/*
1139	 * We cannot rely on the fs check since SYSV IPC does have an
1140	 * additional creator id...
1141	 */
1142	ns = current->nsproxy->ipc_ns;
1143	rcu_read_lock();
1144	shp = shm_obtain_object_check(ns, shmid);
1145	if (IS_ERR(shp)) {
1146		err = PTR_ERR(shp);
1147		goto out_unlock;
1148	}
1149
1150	err = -EACCES;
1151	if (ipcperms(ns, &shp->shm_perm, acc_mode))
1152		goto out_unlock;
1153
1154	err = security_shm_shmat(shp, shmaddr, shmflg);
1155	if (err)
1156		goto out_unlock;
1157
1158	ipc_lock_object(&shp->shm_perm);
1159
1160	/* check if shm_destroy() is tearing down shp */
1161	if (!ipc_valid_object(&shp->shm_perm)) {
1162		ipc_unlock_object(&shp->shm_perm);
1163		err = -EIDRM;
1164		goto out_unlock;
1165	}
1166
1167	path = shp->shm_file->f_path;
1168	path_get(&path);
1169	shp->shm_nattch++;
1170	size = i_size_read(d_inode(path.dentry));
1171	ipc_unlock_object(&shp->shm_perm);
1172	rcu_read_unlock();
1173
1174	err = -ENOMEM;
1175	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
1176	if (!sfd) {
1177		path_put(&path);
1178		goto out_nattch;
1179	}
1180
1181	file = alloc_file(&path, f_mode,
1182			  is_file_hugepages(shp->shm_file) ?
1183				&shm_file_operations_huge :
1184				&shm_file_operations);
1185	err = PTR_ERR(file);
1186	if (IS_ERR(file)) {
1187		kfree(sfd);
1188		path_put(&path);
1189		goto out_nattch;
1190	}
1191
1192	file->private_data = sfd;
1193	file->f_mapping = shp->shm_file->f_mapping;
1194	sfd->id = shp->shm_perm.id;
1195	sfd->ns = get_ipc_ns(ns);
1196	sfd->file = shp->shm_file;
1197	sfd->vm_ops = NULL;
1198
1199	err = security_mmap_file(file, prot, flags);
1200	if (err)
1201		goto out_fput;
1202
1203	down_write(&current->mm->mmap_sem);
1204	if (addr && !(shmflg & SHM_REMAP)) {
1205		err = -EINVAL;
1206		if (addr + size < addr)
1207			goto invalid;
1208
1209		if (find_vma_intersection(current->mm, addr, addr + size))
1210			goto invalid;
1211	}
1212
1213	addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
1214	*raddr = addr;
1215	err = 0;
1216	if (IS_ERR_VALUE(addr))
1217		err = (long)addr;
1218invalid:
1219	up_write(&current->mm->mmap_sem);
1220	if (populate)
1221		mm_populate(addr, populate);
1222
1223out_fput:
1224	fput(file);
1225
1226out_nattch:
1227	down_write(&shm_ids(ns).rwsem);
1228	shp = shm_lock(ns, shmid);
1229	shp->shm_nattch--;
1230	if (shm_may_destroy(ns, shp))
1231		shm_destroy(ns, shp);
1232	else
1233		shm_unlock(shp);
1234	up_write(&shm_ids(ns).rwsem);
1235	return err;
1236
1237out_unlock:
1238	rcu_read_unlock();
1239out:
1240	return err;
1241}
1242
1243SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
1244{
1245	unsigned long ret;
1246	long err;
1247
1248	err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA);
1249	if (err)
1250		return err;
1251	force_successful_syscall_return();
1252	return (long)ret;
1253}
1254
1255/*
1256 * detach and kill segment if marked destroyed.
1257 * The work is done in shm_close.
1258 */
1259SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1260{
1261	struct mm_struct *mm = current->mm;
1262	struct vm_area_struct *vma;
1263	unsigned long addr = (unsigned long)shmaddr;
1264	int retval = -EINVAL;
1265#ifdef CONFIG_MMU
1266	loff_t size = 0;
1267	struct file *file;
1268	struct vm_area_struct *next;
1269#endif
1270
1271	if (addr & ~PAGE_MASK)
1272		return retval;
1273
1274	down_write(&mm->mmap_sem);
1275
1276	/*
1277	 * This function tries to be smart and unmap shm segments that
1278	 * were modified by partial mlock or munmap calls:
1279	 * - It first determines the size of the shm segment that should be
1280	 *   unmapped: It searches for a vma that is backed by shm and that
1281	 *   started at address shmaddr. It records it's size and then unmaps
1282	 *   it.
1283	 * - Then it unmaps all shm vmas that started at shmaddr and that
1284	 *   are within the initially determined size and that are from the
1285	 *   same shm segment from which we determined the size.
1286	 * Errors from do_munmap are ignored: the function only fails if
1287	 * it's called with invalid parameters or if it's called to unmap
1288	 * a part of a vma. Both calls in this function are for full vmas,
1289	 * the parameters are directly copied from the vma itself and always
1290	 * valid - therefore do_munmap cannot fail. (famous last words?)
1291	 */
1292	/*
1293	 * If it had been mremap()'d, the starting address would not
1294	 * match the usual checks anyway. So assume all vma's are
1295	 * above the starting address given.
1296	 */
1297	vma = find_vma(mm, addr);
1298
1299#ifdef CONFIG_MMU
1300	while (vma) {
1301		next = vma->vm_next;
1302
1303		/*
1304		 * Check if the starting address would match, i.e. it's
1305		 * a fragment created by mprotect() and/or munmap(), or it
1306		 * otherwise it starts at this address with no hassles.
1307		 */
1308		if ((vma->vm_ops == &shm_vm_ops) &&
1309			(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {
1310
1311			/*
1312			 * Record the file of the shm segment being
1313			 * unmapped.  With mremap(), someone could place
1314			 * page from another segment but with equal offsets
1315			 * in the range we are unmapping.
1316			 */
1317			file = vma->vm_file;
1318			size = i_size_read(file_inode(vma->vm_file));
1319			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1320			/*
1321			 * We discovered the size of the shm segment, so
1322			 * break out of here and fall through to the next
1323			 * loop that uses the size information to stop
1324			 * searching for matching vma's.
1325			 */
1326			retval = 0;
1327			vma = next;
1328			break;
1329		}
1330		vma = next;
1331	}
1332
1333	/*
1334	 * We need look no further than the maximum address a fragment
1335	 * could possibly have landed at. Also cast things to loff_t to
1336	 * prevent overflows and make comparisons vs. equal-width types.
1337	 */
1338	size = PAGE_ALIGN(size);
1339	while (vma && (loff_t)(vma->vm_end - addr) <= size) {
1340		next = vma->vm_next;
1341
1342		/* finding a matching vma now does not alter retval */
1343		if ((vma->vm_ops == &shm_vm_ops) &&
1344		    ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
1345		    (vma->vm_file == file))
1346			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1347		vma = next;
1348	}
1349
1350#else /* CONFIG_MMU */
1351	/* under NOMMU conditions, the exact address to be destroyed must be
1352	 * given */
1353	if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1354		do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1355		retval = 0;
1356	}
1357
1358#endif
1359
1360	up_write(&mm->mmap_sem);
1361	return retval;
1362}
1363
#ifdef CONFIG_PROC_FS

/* Field width used for the size-like columns; it depends on BITS_PER_LONG. */
#if BITS_PER_LONG <= 32
#define SIZE_SPEC "%10lu"
#else
#define SIZE_SPEC "%21lu"
#endif

/*
 * seq_file show callback: print one line describing the shm segment @it.
 *
 * Owner and creator ids are translated into the user namespace of the
 * seq_file @s; rss and swap are gathered by shm_add_rss_swap() and printed
 * multiplied by PAGE_SIZE.  Always returns 0.
 */
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
{
	struct shmid_kernel *seg = it;
	struct user_namespace *viewer_ns = seq_user_ns(s);
	unsigned long rss_pages = 0;
	unsigned long swap_pages = 0;
	uid_t owner_uid, creator_uid;
	gid_t owner_gid, creator_gid;

	shm_add_rss_swap(seg, &rss_pages, &swap_pages);

	owner_uid = from_kuid_munged(viewer_ns, seg->shm_perm.uid);
	owner_gid = from_kgid_munged(viewer_ns, seg->shm_perm.gid);
	creator_uid = from_kuid_munged(viewer_ns, seg->shm_perm.cuid);
	creator_gid = from_kgid_munged(viewer_ns, seg->shm_perm.cgid);

	seq_printf(s,
		   "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
		   "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
		   SIZE_SPEC " " SIZE_SPEC "\n",
		   seg->shm_perm.key,
		   seg->shm_perm.id,
		   seg->shm_perm.mode,
		   seg->shm_segsz,
		   seg->shm_cprid,
		   seg->shm_lprid,
		   seg->shm_nattch,
		   owner_uid,
		   owner_gid,
		   creator_uid,
		   creator_gid,
		   seg->shm_atim,
		   seg->shm_dtim,
		   seg->shm_ctim,
		   rss_pages * PAGE_SIZE,
		   swap_pages * PAGE_SIZE);

	return 0;
}
#endif
1403