1/*
2 * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
3 * Licensed under the GPL
4 */
5
6#include <unistd.h>
7#include <sched.h>
8#include <signal.h>
9#include <errno.h>
10#include <sys/time.h>
11#include <asm/unistd.h>
12#include <aio.h>
13#include <init.h>
14#include <kern_util.h>
15#include <os.h>
16
/*
 * One request for the 2.4-style I/O thread.  submit_aio_24 writes a whole
 * structure to aio_req_fd_w and not_aio_thread reads it back from
 * aio_req_fd_r, then hands it to do_not_aio.
 */
struct aio_thread_req {
	/* Operation to perform: AIO_READ, AIO_WRITE or AIO_MMAP */
	enum aio_type type;
	/* Descriptor the operation is carried out on */
	int io_fd;
	/* File offset that do_not_aio seeks to before the transfer */
	unsigned long long offset;
	/* Data buffer for AIO_READ/AIO_WRITE; not used for AIO_MMAP */
	char *buf;
	/* Transfer length for AIO_READ/AIO_WRITE; not used for AIO_MMAP */
	int len;
	/* Caller's context; its reply_fd receives the completion reply */
	struct aio_context *aio;
};
25
26#if defined(HAVE_AIO_ABI)
27#include <linux/aio_abi.h>
28
29/*
30 * If we have the headers, we are going to build with AIO enabled.
31 * If we don't have aio in libc, we define the necessary stubs here.
32 */
33
34#if !defined(HAVE_AIO_LIBC)
35
36static long io_setup(int n, aio_context_t *ctxp)
37{
38	return syscall(__NR_io_setup, n, ctxp);
39}
40
41static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
42{
43	return syscall(__NR_io_submit, ctx, nr, iocbpp);
44}
45
46static long io_getevents(aio_context_t ctx_id, long min_nr, long nr,
47			 struct io_event *events, struct timespec *timeout)
48{
49	return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout);
50}
51
52#endif
53
54/*
55 * The AIO_MMAP cases force the mmapped page into memory here
56 * rather than in whatever place first touches the data.  I used
57 * to do this by touching the page, but that's delicate because
58 * gcc is prone to optimizing that away.  So, what's done here
59 * is we read from the descriptor from which the page was
60 * mapped.  The caller is required to pass an offset which is
61 * inside the page that was mapped.  Thus, when the read
62 * returns, we know that the page is in the page cache, and
63 * that it now backs the mmapped area.
64 */
65
66static int do_aio(aio_context_t ctx, enum aio_type type, int fd, char *buf,
67		  int len, unsigned long long offset, struct aio_context *aio)
68{
69	struct iocb *iocbp = & ((struct iocb) {
70				    .aio_data       = (unsigned long) aio,
71				    .aio_fildes     = fd,
72				    .aio_buf        = (unsigned long) buf,
73				    .aio_nbytes     = len,
74				    .aio_offset     = offset
75			     });
76	char c;
77
78	switch (type) {
79	case AIO_READ:
80		iocbp->aio_lio_opcode = IOCB_CMD_PREAD;
81		break;
82	case AIO_WRITE:
83		iocbp->aio_lio_opcode = IOCB_CMD_PWRITE;
84		break;
85	case AIO_MMAP:
86		iocbp->aio_lio_opcode = IOCB_CMD_PREAD;
87		iocbp->aio_buf = (unsigned long) &c;
88		iocbp->aio_nbytes = sizeof(c);
89		break;
90	default:
91		printk(UM_KERN_ERR "Bogus op in do_aio - %d\n", type);
92		return -EINVAL;
93	}
94
95	return (io_submit(ctx, 1, &iocbp) > 0) ? 0 : -errno;
96}
97
/*
 * Host AIO context: filled in by io_setup() in init_aio_26, submitted
 * against by submit_aio_26 and reaped by aio_thread.
 * Initialized in an initcall and unchanged thereafter.
 */
static aio_context_t ctx = 0;
100
101static int aio_thread(void *arg)
102{
103	struct aio_thread_reply reply;
104	struct io_event event;
105	int err, n, reply_fd;
106
107	os_fix_helper_signals();
108	while (1) {
109		n = io_getevents(ctx, 1, 1, &event, NULL);
110		if (n < 0) {
111			if (errno == EINTR)
112				continue;
113			printk(UM_KERN_ERR "aio_thread - io_getevents failed, "
114			       "errno = %d\n", errno);
115		}
116		else {
117			reply = ((struct aio_thread_reply)
118				{ .data = (void *) (long) event.data,
119						.err	= event.res });
120			reply_fd = ((struct aio_context *) reply.data)->reply_fd;
121			err = write(reply_fd, &reply, sizeof(reply));
122			if (err != sizeof(reply))
123				printk(UM_KERN_ERR "aio_thread - write failed, "
124				       "fd = %d, err = %d\n", reply_fd, errno);
125		}
126	}
127	return 0;
128}
129
130#endif
131
132static int do_not_aio(struct aio_thread_req *req)
133{
134	char c;
135	unsigned long long actual;
136	int n;
137
138	actual = lseek64(req->io_fd, req->offset, SEEK_SET);
139	if (actual != req->offset)
140		return -errno;
141
142	switch (req->type) {
143	case AIO_READ:
144		n = read(req->io_fd, req->buf, req->len);
145		break;
146	case AIO_WRITE:
147		n = write(req->io_fd, req->buf, req->len);
148		break;
149	case AIO_MMAP:
150		n = read(req->io_fd, &c, sizeof(c));
151		break;
152	default:
153		printk(UM_KERN_ERR "do_not_aio - bad request type : %d\n",
154		       req->type);
155		return -EINVAL;
156	}
157
158	if (n < 0)
159		return -errno;
160	return 0;
161}
162
/*
 * These are initialized in initcalls and not changed.
 *
 * aio_req_fd_r - descriptor not_aio_thread reads requests from
 * aio_req_fd_w - descriptor submit_aio_24 writes requests to
 * aio_pid      - value returned by run_helper_thread for the helper
 *                thread, or -1 if none was started; exit_aio kills it
 * aio_stack    - stack handed back by run_helper_thread; exit_aio
 *                releases it with free_stack
 */
static int aio_req_fd_r = -1;
static int aio_req_fd_w = -1;
static int aio_pid = -1;
static unsigned long aio_stack;
168
169static int not_aio_thread(void *arg)
170{
171	struct aio_thread_req req;
172	struct aio_thread_reply reply;
173	int err;
174
175	os_fix_helper_signals();
176	while (1) {
177		err = read(aio_req_fd_r, &req, sizeof(req));
178		if (err != sizeof(req)) {
179			if (err < 0)
180				printk(UM_KERN_ERR "not_aio_thread - "
181				       "read failed, fd = %d, err = %d\n",
182				       aio_req_fd_r,
183				       errno);
184			else {
185				printk(UM_KERN_ERR "not_aio_thread - short "
186				       "read, fd = %d, length = %d\n",
187				       aio_req_fd_r, err);
188			}
189			continue;
190		}
191		err = do_not_aio(&req);
192		reply = ((struct aio_thread_reply) { .data 	= req.aio,
193						     .err	= err });
194		err = write(req.aio->reply_fd, &reply, sizeof(reply));
195		if (err != sizeof(reply))
196			printk(UM_KERN_ERR "not_aio_thread - write failed, "
197			       "fd = %d, err = %d\n", req.aio->reply_fd, errno);
198	}
199
200	return 0;
201}
202
203static int init_aio_24(void)
204{
205	int fds[2], err;
206
207	err = os_pipe(fds, 1, 1);
208	if (err)
209		goto out;
210
211	aio_req_fd_w = fds[0];
212	aio_req_fd_r = fds[1];
213
214	err = os_set_fd_block(aio_req_fd_w, 0);
215	if (err)
216		goto out_close_pipe;
217
218	err = run_helper_thread(not_aio_thread, NULL,
219				CLONE_FILES | CLONE_VM, &aio_stack);
220	if (err < 0)
221		goto out_close_pipe;
222
223	aio_pid = err;
224	goto out;
225
226out_close_pipe:
227	close(fds[0]);
228	close(fds[1]);
229	aio_req_fd_w = -1;
230	aio_req_fd_r = -1;
231out:
232#ifndef HAVE_AIO_ABI
233	printk(UM_KERN_INFO "/usr/include/linux/aio_abi.h not present during "
234	       "build\n");
235#endif
236	printk(UM_KERN_INFO "2.6 host AIO support not used - falling back to "
237	       "I/O thread\n");
238	return 0;
239}
240
241#ifdef HAVE_AIO_ABI
242#define DEFAULT_24_AIO 0
243static int init_aio_26(void)
244{
245	int err;
246
247	if (io_setup(256, &ctx)) {
248		err = -errno;
249		printk(UM_KERN_ERR "aio_thread failed to initialize context, "
250		       "err = %d\n", errno);
251		return err;
252	}
253
254	err = run_helper_thread(aio_thread, NULL,
255				CLONE_FILES | CLONE_VM, &aio_stack);
256	if (err < 0)
257		return err;
258
259	aio_pid = err;
260
261	printk(UM_KERN_INFO "Using 2.6 host AIO\n");
262	return 0;
263}
264
265static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len,
266			 unsigned long long offset, struct aio_context *aio)
267{
268	struct aio_thread_reply reply;
269	int err;
270
271	err = do_aio(ctx, type, io_fd, buf, len, offset, aio);
272	if (err) {
273		reply = ((struct aio_thread_reply) { .data = aio,
274					 .err  = err });
275		err = write(aio->reply_fd, &reply, sizeof(reply));
276		if (err != sizeof(reply)) {
277			err = -errno;
278			printk(UM_KERN_ERR "submit_aio_26 - write failed, "
279			       "fd = %d, err = %d\n", aio->reply_fd, -err);
280		}
281		else err = 0;
282	}
283
284	return err;
285}
286
287#else
/* Without the AIO ABI headers, the 2.4-style I/O thread is the default */
#define DEFAULT_24_AIO 1

/*
 * Placeholder used when the AIO ABI headers were missing at build time:
 * there is no host AIO to set up, so it always reports -ENOSYS.
 */
static int init_aio_26(void)
{
	int unsupported = -ENOSYS;

	return unsupported;
}
293
294static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len,
295			 unsigned long long offset, struct aio_context *aio)
296{
297	return -ENOSYS;
298}
299#endif
300
/*
 * Nonzero selects the 2.4-style I/O thread, zero selects 2.6 host AIO.
 * Starts out as DEFAULT_24_AIO, is forced to 1 by the "aio=2.4" option
 * handler below and by init_aio when it falls back, and is read by
 * submit_aio to pick the submission path.
 * Initialized in an initcall and unchanged thereafter.
 */
static int aio_24 = DEFAULT_24_AIO;

/*
 * Handler for the "aio=2.4" option: forces the 2.4-style I/O thread.
 * Neither argument is used; always returns 0.
 */
static int __init set_aio_24(char *name, int *add)
{
	aio_24 = 1;
	return 0;
}

/* Registers set_aio_24 for "aio=2.4" together with its help text */
__uml_setup("aio=2.4", set_aio_24,
"aio=2.4\n"
"    This is used to force UML to use 2.4-style AIO even when 2.6 AIO is\n"
"    available.  2.4 AIO is a single thread that handles one request at a\n"
"    time, synchronously.  2.6 AIO is a thread which uses the 2.6 AIO \n"
"    interface to handle an arbitrary number of pending requests.  2.6 AIO \n"
"    is not available in tt mode, on 2.4 hosts, or when UML is built with\n"
"    /usr/include/linux/aio_abi.h not available.  Many distributions don't\n"
"    include aio_abi.h, so you will need to copy it from a kernel tree to\n"
"    your /usr/include/linux in order to build an AIO-capable UML\n\n"
);
321
322static int init_aio(void)
323{
324	int err;
325
326	if (!aio_24) {
327		err = init_aio_26();
328		if (err && (errno == ENOSYS)) {
329			printk(UM_KERN_INFO "2.6 AIO not supported on the "
330			       "host - reverting to 2.4 AIO\n");
331			aio_24 = 1;
332		}
333		else return err;
334	}
335
336	if (aio_24)
337		return init_aio_24();
338
339	return 0;
340}
341
/*
 * The reason for the __initcall/__uml_exitcall asymmetry is that init_aio
 * needs to be called when the kernel is running because it calls
 * run_helper_thread, which needs get_free_page.  exit_aio is a
 * __uml_exitcall because the generic kernel does not run __exitcalls on
 * shutdown, and can't because many of them break when called outside of
 * module unloading.
 */
349__initcall(init_aio);
350
351static void exit_aio(void)
352{
353	if (aio_pid != -1) {
354		os_kill_process(aio_pid, 1);
355		free_stack(aio_stack, 0);
356	}
357}
358
359__uml_exitcall(exit_aio);
360
361static int submit_aio_24(enum aio_type type, int io_fd, char *buf, int len,
362			 unsigned long long offset, struct aio_context *aio)
363{
364	struct aio_thread_req req = { .type 		= type,
365				      .io_fd		= io_fd,
366				      .offset		= offset,
367				      .buf		= buf,
368				      .len		= len,
369				      .aio		= aio,
370	};
371	int err;
372
373	err = write(aio_req_fd_w, &req, sizeof(req));
374	if (err == sizeof(req))
375		err = 0;
376	else err = -errno;
377
378	return err;
379}
380
381int submit_aio(enum aio_type type, int io_fd, char *buf, int len,
382	       unsigned long long offset, int reply_fd,
383	       struct aio_context *aio)
384{
385	aio->reply_fd = reply_fd;
386	if (aio_24)
387		return submit_aio_24(type, io_fd, buf, len, offset, aio);
388	else
389		return submit_aio_26(type, io_fd, buf, len, offset, aio);
390}
391