1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/dir.c
37  *
38  * Directory code for lustre client.
39  */
40 
41 #include <linux/fs.h>
42 #include <linux/pagemap.h>
43 #include <linux/mm.h>
44 #include <linux/uaccess.h>
45 #include <linux/buffer_head.h>   /* for wait_on_buffer */
46 #include <linux/pagevec.h>
47 #include <linux/prefetch.h>
48 
49 #define DEBUG_SUBSYSTEM S_LLITE
50 
51 #include "../include/obd_support.h"
52 #include "../include/obd_class.h"
53 #include "../include/lustre_lib.h"
54 #include "../include/lustre/lustre_idl.h"
55 #include "../include/lustre_lite.h"
56 #include "../include/lustre_dlm.h"
57 #include "../include/lustre_fid.h"
58 #include "llite_internal.h"
59 
60 /*
61  * (new) readdir implementation overview.
62  *
63  * Original lustre readdir implementation cached exact copy of raw directory
64  * pages on the client. These pages were indexed in client page cache by
65  * logical offset in the directory file. This design, while very simple and
66  * intuitive had some inherent problems:
67  *
68  *     . it implies that byte offset to the directory entry serves as a
69  *     telldir(3)/seekdir(3) cookie, but that offset is not stable: in
70  *     ext3/htree directory entries may move due to splits, and more
71  *     importantly,
72  *
73  *     . it is incompatible with the design of split directories for cmd3,
74  *     that assumes that names are distributed across nodes based on their
75  *     hash, and so readdir should be done in hash order.
76  *
 * The new readdir implementation does readdir in hash order, and uses the
 * hash of a file name as a telldir/seekdir cookie. This led to a number of
 * complications:
79  *
80  *     . hash is not unique, so it cannot be used to index cached directory
81  *     pages on the client (note, that it requires a whole pageful of hash
82  *     collided entries to cause two pages to have identical hashes);
83  *
84  *     . hash is not unique, so it cannot, strictly speaking, be used as an
85  *     entry cookie. ext3/htree has the same problem and lustre implementation
86  *     mimics their solution: seekdir(hash) positions directory at the first
87  *     entry with the given hash.
88  *
89  * Client side.
90  *
91  * 0. caching
92  *
93  * Client caches directory pages using hash of the first entry as an index. As
94  * noted above hash is not unique, so this solution doesn't work as is:
95  * special processing is needed for "page hash chains" (i.e., sequences of
96  * pages filled with entries all having the same hash value).
97  *
98  * First, such chains have to be detected. To this end, server returns to the
99  * client the hash of the first entry on the page next to one returned. When
100  * client detects that this hash is the same as hash of the first entry on the
101  * returned page, page hash collision has to be handled. Pages in the
102  * hash chain, except first one, are termed "overflow pages".
103  *
104  * Solution to index uniqueness problem is to not cache overflow
105  * pages. Instead, when page hash collision is detected, all overflow pages
106  * from emerging chain are immediately requested from the server and placed in
107  * a special data structure (struct ll_dir_chain). This data structure is used
108  * by ll_readdir() to process entries from overflow pages. When readdir
109  * invocation finishes, overflow pages are discarded. If page hash collision
110  * chain weren't completely processed, next call to readdir will again detect
111  * page hash collision, again read overflow pages in, process next portion of
112  * entries and again discard the pages. This is not as wasteful as it looks,
113  * because, given reasonable hash, page hash collisions are extremely rare.
114  *
115  * 1. directory positioning
116  *
117  * When seekdir(hash) is called, original
118  *
119  *
120  *
121  *
122  *
123  *
124  *
125  *
126  * Server.
127  *
128  * identification of and access to overflow pages
129  *
130  * page format
131  *
 * Pages in an MDS_READPAGE RPC are packed in units of LU_PAGE_SIZE, and each
 * page contains a header, lu_dirpage, which describes the start/end hash and
 * whether this page is empty (contains no dir entry) or its hash collides
 * with the next page. After the client receives the reply, several such
 * pages are integrated into one dir page of PAGE_CACHE_SIZE (if
 * PAGE_CACHE_SIZE is greater than LU_PAGE_SIZE), and the lu_dirpage for this
 * integrated page is adjusted. See lmv_adjust_dirpages().
139  *
140  */
141 
142 /* returns the page unlocked, but with a reference */
ll_dir_filler(void * _hash,struct page * page0)143 static int ll_dir_filler(void *_hash, struct page *page0)
144 {
145 	struct inode *inode = page0->mapping->host;
146 	int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH;
147 	struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp;
148 	struct ptlrpc_request *request;
149 	struct mdt_body *body;
150 	struct md_op_data *op_data;
151 	__u64 hash = *((__u64 *)_hash);
152 	struct page **page_pool;
153 	struct page *page;
154 	struct lu_dirpage *dp;
155 	int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_CACHE_SHIFT;
156 	int nrdpgs = 0; /* number of pages read actually */
157 	int npages;
158 	int i;
159 	int rc;
160 
161 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash %llu\n",
162 	       inode->i_ino, inode->i_generation, inode, hash);
163 
164 	LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES);
165 
166 	page_pool = kcalloc(max_pages, sizeof(page), GFP_NOFS);
167 	if (page_pool) {
168 		page_pool[0] = page0;
169 	} else {
170 		page_pool = &page0;
171 		max_pages = 1;
172 	}
173 	for (npages = 1; npages < max_pages; npages++) {
174 		page = page_cache_alloc_cold(inode->i_mapping);
175 		if (!page)
176 			break;
177 		page_pool[npages] = page;
178 	}
179 
180 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
181 				     LUSTRE_OPC_ANY, NULL);
182 	op_data->op_npages = npages;
183 	op_data->op_offset = hash;
184 	rc = md_readpage(exp, op_data, page_pool, &request);
185 	ll_finish_md_op_data(op_data);
186 	if (rc < 0) {
187 		/* page0 is special, which was added into page cache early */
188 		delete_from_page_cache(page0);
189 	} else if (rc == 0) {
190 		body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
191 		/* Checked by mdc_readpage() */
192 		LASSERT(body != NULL);
193 
194 		if (body->valid & OBD_MD_FLSIZE)
195 			cl_isize_write(inode, body->size);
196 
197 		nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_CACHE_SIZE-1)
198 			 >> PAGE_CACHE_SHIFT;
199 		SetPageUptodate(page0);
200 	}
201 	unlock_page(page0);
202 	ptlrpc_req_finished(request);
203 
204 	CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages);
205 
206 	ll_pagevec_init(&lru_pvec, 0);
207 	for (i = 1; i < npages; i++) {
208 		unsigned long offset;
209 		int ret;
210 
211 		page = page_pool[i];
212 
213 		if (rc < 0 || i >= nrdpgs) {
214 			page_cache_release(page);
215 			continue;
216 		}
217 
218 		SetPageUptodate(page);
219 
220 		dp = kmap(page);
221 		hash = le64_to_cpu(dp->ldp_hash_start);
222 		kunmap(page);
223 
224 		offset = hash_x_index(hash, hash64);
225 
226 		prefetchw(&page->flags);
227 		ret = add_to_page_cache_lru(page, inode->i_mapping, offset,
228 					    GFP_KERNEL);
229 		if (ret == 0) {
230 			unlock_page(page);
231 			if (ll_pagevec_add(&lru_pvec, page) == 0)
232 				ll_pagevec_lru_add_file(&lru_pvec);
233 		} else {
234 			CDEBUG(D_VFSTRACE, "page %lu add to page cache failed: %d\n",
235 			       offset, ret);
236 		}
237 		page_cache_release(page);
238 	}
239 	ll_pagevec_lru_add_file(&lru_pvec);
240 
241 	if (page_pool != &page0)
242 		OBD_FREE(page_pool, sizeof(struct page *) * max_pages);
243 	return rc;
244 }
245 
/*
 * Flag a directory page as checked.
 *
 * No validation of the page contents is performed yet; \a dir is currently
 * unused.
 */
static void ll_check_page(struct inode *dir, struct page *page)
{
	/* XXX: check page format later */
	SetPageChecked(page);
}
251 
ll_release_page(struct page * page,int remove)252 void ll_release_page(struct page *page, int remove)
253 {
254 	kunmap(page);
255 	if (remove) {
256 		lock_page(page);
257 		if (likely(page->mapping != NULL))
258 			truncate_complete_page(page->mapping, page);
259 		unlock_page(page);
260 	}
261 	page_cache_release(page);
262 }
263 
264 /*
265  * Find, kmap and return page that contains given hash.
266  */
ll_dir_page_locate(struct inode * dir,__u64 * hash,__u64 * start,__u64 * end)267 static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
268 				       __u64 *start, __u64 *end)
269 {
270 	int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
271 	struct address_space *mapping = dir->i_mapping;
272 	/*
273 	 * Complement of hash is used as an index so that
274 	 * radix_tree_gang_lookup() can be used to find a page with starting
275 	 * hash _smaller_ than one we are looking for.
276 	 */
277 	unsigned long offset = hash_x_index(*hash, hash64);
278 	struct page *page;
279 	int found;
280 
281 	spin_lock_irq(&mapping->tree_lock);
282 	found = radix_tree_gang_lookup(&mapping->page_tree,
283 				       (void **)&page, offset, 1);
284 	if (found > 0 && !radix_tree_exceptional_entry(page)) {
285 		struct lu_dirpage *dp;
286 
287 		page_cache_get(page);
288 		spin_unlock_irq(&mapping->tree_lock);
289 		/*
290 		 * In contrast to find_lock_page() we are sure that directory
291 		 * page cannot be truncated (while DLM lock is held) and,
292 		 * hence, can avoid restart.
293 		 *
294 		 * In fact, page cannot be locked here at all, because
295 		 * ll_dir_filler() does synchronous io.
296 		 */
297 		wait_on_page_locked(page);
298 		if (PageUptodate(page)) {
299 			dp = kmap(page);
300 			if (BITS_PER_LONG == 32 && hash64) {
301 				*start = le64_to_cpu(dp->ldp_hash_start) >> 32;
302 				*end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
303 				*hash  = *hash >> 32;
304 			} else {
305 				*start = le64_to_cpu(dp->ldp_hash_start);
306 				*end   = le64_to_cpu(dp->ldp_hash_end);
307 			}
308 			LASSERTF(*start <= *hash, "start = %#llx,end = %#llx,hash = %#llx\n",
309 				 *start, *end, *hash);
310 			CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash %llu\n",
311 			       offset, *start, *end, *hash);
312 			if (*hash > *end) {
313 				ll_release_page(page, 0);
314 				page = NULL;
315 			} else if (*end != *start && *hash == *end) {
316 				/*
317 				 * upon hash collision, remove this page,
318 				 * otherwise put page reference, and
319 				 * ll_get_dir_page() will issue RPC to fetch
320 				 * the page we want.
321 				 */
322 				ll_release_page(page,
323 				    le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
324 				page = NULL;
325 			}
326 		} else {
327 			page_cache_release(page);
328 			page = ERR_PTR(-EIO);
329 		}
330 
331 	} else {
332 		spin_unlock_irq(&mapping->tree_lock);
333 		page = NULL;
334 	}
335 	return page;
336 }
337 
/*
 * Return the directory page of \a dir that contains \a hash.
 *
 * A PR inodebits (MDS_INODELOCK_UPDATE) lock on the directory is matched or,
 * failing that, enqueued; its reference is dropped again before returning.
 * The page is taken from the page cache when ll_dir_page_locate() finds it,
 * otherwise it is read through ll_dir_filler().  lli_readdir_mutex is held
 * around the lookup and read.
 *
 * \param dir    directory inode
 * \param hash   hash the returned page has to cover
 * \param chain  not used by this function
 *
 * \return the page, kmap()ed and with a reference held (to be given back
 *	   with ll_release_page()), or an ERR_PTR() value on failure.
 */
struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
			     struct ll_dir_chain *chain)
{
	ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
	struct ll_sb_info *sbi = ll_i2sbi(dir);
	struct ll_inode_info *lli = ll_i2info(dir);
	struct address_space *mapping = dir->i_mapping;
	int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH;
	ldlm_mode_t mode = LCK_PR;
	struct lustre_handle lockh;
	struct lu_dirpage *dp;
	struct page *page;
	__u64 start = 0;
	__u64 end = 0;
	__u64 lhash = hash;
	int rc;

	rc = md_lock_match(sbi->ll_md_exp, LDLM_FL_BLOCK_GRANTED,
			   ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh);
	if (rc) {
		/* for cross-ref object, l_ast_data of the lock may not be set,
		 * we reset it here */
		md_set_lock_data(sbi->ll_md_exp, &lockh.cookie, dir, NULL);
	} else {
		/* No matching lock is cached: enqueue a new one. */
		struct ldlm_enqueue_info einfo = {
			.ei_type = LDLM_IBITS,
			.ei_mode = mode,
			.ei_cb_bl = ll_md_blocking_ast,
			.ei_cb_cp = ldlm_completion_ast,
		};
		struct lookup_intent it = { .it_op = IT_READDIR };
		struct ptlrpc_request *request;
		struct md_op_data *op_data;

		op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0,
					     LUSTRE_OPC_ANY, NULL);
		if (IS_ERR(op_data))
			return (void *)op_data;

		rc = md_enqueue(sbi->ll_md_exp, &einfo, &it,
				op_data, &lockh, NULL, 0, NULL, 0);

		ll_finish_md_op_data(op_data);

		request = (struct ptlrpc_request *)it.d.lustre.it_data;
		if (request)
			ptlrpc_req_finished(request);
		if (rc < 0) {
			CERROR("lock enqueue: "DFID" at %llu: rc %d\n",
				PFID(ll_inode2fid(dir)), hash, rc);
			return ERR_PTR(rc);
		}

		CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n",
		       dir, dir->i_ino, dir->i_generation);
		md_set_lock_data(sbi->ll_md_exp,
				 &it.d.lustre.it_lock_handle, dir, NULL);
	}
	ldlm_lock_dump_handle(D_OTHER, &lockh);

	mutex_lock(&lli->lli_readdir_mutex);
	page = ll_dir_page_locate(dir, &lhash, &start, &end);
	if (IS_ERR(page)) {
		CERROR("dir page locate: "DFID" at %llu: rc %ld\n",
		       PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page));
		goto out_unlock;
	}

	/*
	 * XXX nikita: when the page was found in the cache (page != NULL) a
	 * corner case is not handled entirely correctly: suppose hash chain
	 * of entries with hash value HASH crosses border between pages P0
	 * and P1. First both P0 and P1 are cached, seekdir() is called for
	 * some entry from the P0 part of the chain. Later P0 goes out of
	 * cache. telldir(HASH) happens and finds P1, as it starts with
	 * matching hash value. Remaining entries from P0 part of the chain
	 * are skipped. (Is that really a bug?)
	 *
	 * Possible solutions: 0. don't cache P1 in such case, handle it as
	 * an "overflow" page. 1. invalidate all pages at once. 2. use HASH|1
	 * as an index for P1.
	 */
	if (!page) {
		/* Cache miss: read the page in and sanity-check it. */
		page = read_cache_page(mapping, hash_x_index(hash, hash64),
				       ll_dir_filler, &lhash);
		if (IS_ERR(page)) {
			CERROR("read cache page: "DFID" at %llu: rc %ld\n",
			       PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
			goto out_unlock;
		}

		wait_on_page_locked(page);
		(void)kmap(page);
		if (!PageUptodate(page)) {
			CERROR("page not updated: "DFID" at %llu: rc %d\n",
			       PFID(ll_inode2fid(dir)), hash, -EIO);
			goto fail;
		}
		if (!PageChecked(page))
			ll_check_page(dir, page);
		if (PageError(page)) {
			CERROR("page error: "DFID" at %llu: rc %d\n",
			       PFID(ll_inode2fid(dir)), hash, -EIO);
			goto fail;
		}
	}

	/* Both paths arrive here with the page kmap()ed. */
	dp = page_address(page);
	if (BITS_PER_LONG == 32 && hash64) {
		start = le64_to_cpu(dp->ldp_hash_start) >> 32;
		end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
		lhash = hash >> 32;
	} else {
		start = le64_to_cpu(dp->ldp_hash_start);
		end   = le64_to_cpu(dp->ldp_hash_end);
		lhash = hash;
	}
	if (end == start) {
		LASSERT(start == lhash);
		CWARN("Page-wide hash collision: %llu\n", end);
		if (BITS_PER_LONG == 32 && hash64)
			CWARN("Real page-wide hash collision at [%llu %llu] with hash %llu\n",
			      le64_to_cpu(dp->ldp_hash_start),
			      le64_to_cpu(dp->ldp_hash_end), hash);
		/*
		 * Fetch whole overflow chain...
		 *
		 * XXX not yet.
		 */
		goto fail;
	}
out_unlock:
	mutex_unlock(&lli->lli_readdir_mutex);
	ldlm_lock_decref(&lockh, mode);
	return page;

fail:
	/* drop and remove the unusable page, then report -EIO */
	ll_release_page(page, 1);
	page = ERR_PTR(-EIO);
	goto out_unlock;
}
480 
ll_dir_read(struct inode * inode,struct dir_context * ctx)481 int ll_dir_read(struct inode *inode, struct dir_context *ctx)
482 {
483 	struct ll_inode_info *info       = ll_i2info(inode);
484 	struct ll_sb_info    *sbi	= ll_i2sbi(inode);
485 	__u64		   pos		= ctx->pos;
486 	int		   api32      = ll_need_32bit_api(sbi);
487 	int		   hash64     = sbi->ll_flags & LL_SBI_64BIT_HASH;
488 	struct page	  *page;
489 	struct ll_dir_chain   chain;
490 	int		   done = 0;
491 	int		   rc = 0;
492 
493 	ll_dir_chain_init(&chain);
494 
495 	page = ll_get_dir_page(inode, pos, &chain);
496 
497 	while (rc == 0 && !done) {
498 		struct lu_dirpage *dp;
499 		struct lu_dirent  *ent;
500 
501 		if (!IS_ERR(page)) {
502 			/*
503 			 * If page is empty (end of directory is reached),
504 			 * use this value.
505 			 */
506 			__u64 hash = MDS_DIR_END_OFF;
507 			__u64 next;
508 
509 			dp = page_address(page);
510 			for (ent = lu_dirent_start(dp); ent != NULL && !done;
511 			     ent = lu_dirent_next(ent)) {
512 				__u16	  type;
513 				int	    namelen;
514 				struct lu_fid  fid;
515 				__u64	  lhash;
516 				__u64	  ino;
517 
518 				/*
519 				 * XXX: implement correct swabbing here.
520 				 */
521 
522 				hash = le64_to_cpu(ent->lde_hash);
523 				if (hash < pos)
524 					/*
525 					 * Skip until we find target hash
526 					 * value.
527 					 */
528 					continue;
529 
530 				namelen = le16_to_cpu(ent->lde_namelen);
531 				if (namelen == 0)
532 					/*
533 					 * Skip dummy record.
534 					 */
535 					continue;
536 
537 				if (api32 && hash64)
538 					lhash = hash >> 32;
539 				else
540 					lhash = hash;
541 				fid_le_to_cpu(&fid, &ent->lde_fid);
542 				ino = cl_fid_build_ino(&fid, api32);
543 				type = ll_dirent_type_get(ent);
544 				ctx->pos = lhash;
545 				/* For 'll_nfs_get_name_filldir()', it will try
546 				 * to access the 'ent' through its 'lde_name',
547 				 * so the parameter 'name' for 'ctx->actor()'
548 				 * must be part of the 'ent'.
549 				 */
550 				done = !dir_emit(ctx, ent->lde_name,
551 						 namelen, ino, type);
552 			}
553 			next = le64_to_cpu(dp->ldp_hash_end);
554 			if (!done) {
555 				pos = next;
556 				if (pos == MDS_DIR_END_OFF) {
557 					/*
558 					 * End of directory reached.
559 					 */
560 					done = 1;
561 					ll_release_page(page, 0);
562 				} else if (1 /* chain is exhausted*/) {
563 					/*
564 					 * Normal case: continue to the next
565 					 * page.
566 					 */
567 					ll_release_page(page,
568 					    le32_to_cpu(dp->ldp_flags) &
569 							LDF_COLLIDE);
570 					next = pos;
571 					page = ll_get_dir_page(inode, pos,
572 							       &chain);
573 				} else {
574 					/*
575 					 * go into overflow page.
576 					 */
577 					LASSERT(le32_to_cpu(dp->ldp_flags) &
578 						LDF_COLLIDE);
579 					ll_release_page(page, 1);
580 				}
581 			} else {
582 				pos = hash;
583 				ll_release_page(page, 0);
584 			}
585 		} else {
586 			rc = PTR_ERR(page);
587 			CERROR("error reading dir "DFID" at %lu: rc %d\n",
588 			       PFID(&info->lli_fid), (unsigned long)pos, rc);
589 		}
590 	}
591 
592 	ctx->pos = pos;
593 	ll_dir_chain_fini(&chain);
594 	return rc;
595 }
596 
ll_readdir(struct file * filp,struct dir_context * ctx)597 static int ll_readdir(struct file *filp, struct dir_context *ctx)
598 {
599 	struct inode		*inode	= file_inode(filp);
600 	struct ll_file_data	*lfd	= LUSTRE_FPRIVATE(filp);
601 	struct ll_sb_info	*sbi	= ll_i2sbi(inode);
602 	int			hash64	= sbi->ll_flags & LL_SBI_64BIT_HASH;
603 	int			api32	= ll_need_32bit_api(sbi);
604 	int			rc;
605 
606 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu 32bit_api %d\n",
607 	       inode->i_ino, inode->i_generation,
608 	       inode, (unsigned long)lfd->lfd_pos, i_size_read(inode), api32);
609 
610 	if (lfd->lfd_pos == MDS_DIR_END_OFF) {
611 		/*
612 		 * end-of-file.
613 		 */
614 		rc = 0;
615 		goto out;
616 	}
617 
618 	ctx->pos = lfd->lfd_pos;
619 	rc = ll_dir_read(inode, ctx);
620 	lfd->lfd_pos = ctx->pos;
621 	if (ctx->pos == MDS_DIR_END_OFF) {
622 		if (api32)
623 			ctx->pos = LL_DIR_END_OFF_32BIT;
624 		else
625 			ctx->pos = LL_DIR_END_OFF;
626 	} else {
627 		if (api32 && hash64)
628 			ctx->pos >>= 32;
629 	}
630 	filp->f_version = inode->i_version;
631 
632 out:
633 	if (!rc)
634 		ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1);
635 
636 	return rc;
637 }
638 
ll_send_mgc_param(struct obd_export * mgc,char * string)639 static int ll_send_mgc_param(struct obd_export *mgc, char *string)
640 {
641 	struct mgs_send_param *msp;
642 	int rc = 0;
643 
644 	msp = kzalloc(sizeof(*msp), GFP_NOFS);
645 	if (!msp)
646 		return -ENOMEM;
647 
648 	strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN);
649 	rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
650 				sizeof(struct mgs_send_param), msp, NULL);
651 	if (rc)
652 		CERROR("Failed to set parameter: %d\n", rc);
653 	OBD_FREE_PTR(msp);
654 
655 	return rc;
656 }
657 
ll_dir_setdirstripe(struct inode * dir,struct lmv_user_md * lump,char * filename)658 static int ll_dir_setdirstripe(struct inode *dir, struct lmv_user_md *lump,
659 			       char *filename)
660 {
661 	struct ptlrpc_request *request = NULL;
662 	struct md_op_data *op_data;
663 	struct ll_sb_info *sbi = ll_i2sbi(dir);
664 	int mode;
665 	int err;
666 
667 	mode = (0755 & ~current_umask()) | S_IFDIR;
668 	op_data = ll_prep_md_op_data(NULL, dir, NULL, filename,
669 				     strlen(filename), mode, LUSTRE_OPC_MKDIR,
670 				     lump);
671 	if (IS_ERR(op_data)) {
672 		err = PTR_ERR(op_data);
673 		goto err_exit;
674 	}
675 
676 	op_data->op_cli_flags |= CLI_SET_MEA;
677 	err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode,
678 			from_kuid(&init_user_ns, current_fsuid()),
679 			from_kgid(&init_user_ns, current_fsgid()),
680 			cfs_curproc_cap_pack(), 0, &request);
681 	ll_finish_md_op_data(op_data);
682 	if (err)
683 		goto err_exit;
684 err_exit:
685 	ptlrpc_req_finished(request);
686 	return err;
687 }
688 
ll_dir_setstripe(struct inode * inode,struct lov_user_md * lump,int set_default)689 int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
690 		     int set_default)
691 {
692 	struct ll_sb_info *sbi = ll_i2sbi(inode);
693 	struct md_op_data *op_data;
694 	struct ptlrpc_request *req = NULL;
695 	int rc = 0;
696 	struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
697 	struct obd_device *mgc = lsi->lsi_mgc;
698 	int lum_size;
699 
700 	if (lump != NULL) {
701 		/*
702 		 * This is coming from userspace, so should be in
703 		 * local endian.  But the MDS would like it in little
704 		 * endian, so we swab it before we send it.
705 		 */
706 		switch (lump->lmm_magic) {
707 		case LOV_USER_MAGIC_V1: {
708 			if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
709 				lustre_swab_lov_user_md_v1(lump);
710 			lum_size = sizeof(struct lov_user_md_v1);
711 			break;
712 		}
713 		case LOV_USER_MAGIC_V3: {
714 			if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
715 				lustre_swab_lov_user_md_v3(
716 					(struct lov_user_md_v3 *)lump);
717 			lum_size = sizeof(struct lov_user_md_v3);
718 			break;
719 		}
720 		default: {
721 			CDEBUG(D_IOCTL, "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
722 			       lump->lmm_magic, LOV_USER_MAGIC_V1,
723 			       LOV_USER_MAGIC_V3);
724 			return -EINVAL;
725 		}
726 		}
727 	} else {
728 		lum_size = sizeof(struct lov_user_md_v1);
729 	}
730 
731 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
732 				     LUSTRE_OPC_ANY, NULL);
733 	if (IS_ERR(op_data))
734 		return PTR_ERR(op_data);
735 
736 	if (lump != NULL && lump->lmm_magic == cpu_to_le32(LMV_USER_MAGIC))
737 		op_data->op_cli_flags |= CLI_SET_MEA;
738 
739 	/* swabbing is done in lov_setstripe() on server side */
740 	rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size,
741 			NULL, 0, &req, NULL);
742 	ll_finish_md_op_data(op_data);
743 	ptlrpc_req_finished(req);
744 	if (rc) {
745 		if (rc != -EPERM && rc != -EACCES)
746 			CERROR("mdc_setattr fails: rc = %d\n", rc);
747 	}
748 
749 	/* In the following we use the fact that LOV_USER_MAGIC_V1 and
750 	 LOV_USER_MAGIC_V3 have the same initial fields so we do not
751 	 need to make the distinction between the 2 versions */
752 	if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
753 		char *param = NULL;
754 		char *buf;
755 
756 		param = kzalloc(MGS_PARAM_MAXLEN, GFP_NOFS);
757 		if (!param) {
758 			rc = -ENOMEM;
759 			goto end;
760 		}
761 
762 		buf = param;
763 		/* Get fsname and assume devname to be -MDT0000. */
764 		ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN);
765 		strcat(buf, "-MDT0000.lov");
766 		buf += strlen(buf);
767 
768 		/* Set root stripesize */
769 		sprintf(buf, ".stripesize=%u",
770 			lump ? le32_to_cpu(lump->lmm_stripe_size) : 0);
771 		rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
772 		if (rc)
773 			goto end;
774 
775 		/* Set root stripecount */
776 		sprintf(buf, ".stripecount=%hd",
777 			lump ? le16_to_cpu(lump->lmm_stripe_count) : 0);
778 		rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
779 		if (rc)
780 			goto end;
781 
782 		/* Set root stripeoffset */
783 		sprintf(buf, ".stripeoffset=%hd",
784 			lump ? le16_to_cpu(lump->lmm_stripe_offset) :
785 			(typeof(lump->lmm_stripe_offset))(-1));
786 		rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
787 
788 end:
789 		if (param != NULL)
790 			OBD_FREE(param, MGS_PARAM_MAXLEN);
791 	}
792 	return rc;
793 }
794 
ll_dir_getstripe(struct inode * inode,struct lov_mds_md ** lmmp,int * lmm_size,struct ptlrpc_request ** request)795 int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
796 		     int *lmm_size, struct ptlrpc_request **request)
797 {
798 	struct ll_sb_info *sbi = ll_i2sbi(inode);
799 	struct mdt_body   *body;
800 	struct lov_mds_md *lmm = NULL;
801 	struct ptlrpc_request *req = NULL;
802 	int rc, lmmsize;
803 	struct md_op_data *op_data;
804 
805 	rc = ll_get_default_mdsize(sbi, &lmmsize);
806 	if (rc)
807 		return rc;
808 
809 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
810 				     0, lmmsize, LUSTRE_OPC_ANY,
811 				     NULL);
812 	if (IS_ERR(op_data))
813 		return PTR_ERR(op_data);
814 
815 	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
816 	rc = md_getattr(sbi->ll_md_exp, op_data, &req);
817 	ll_finish_md_op_data(op_data);
818 	if (rc < 0) {
819 		CDEBUG(D_INFO, "md_getattr failed on inode %lu/%u: rc %d\n",
820 		       inode->i_ino,
821 		       inode->i_generation, rc);
822 		goto out;
823 	}
824 
825 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
826 	LASSERT(body != NULL);
827 
828 	lmmsize = body->eadatasize;
829 
830 	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
831 	    lmmsize == 0) {
832 		rc = -ENODATA;
833 		goto out;
834 	}
835 
836 	lmm = req_capsule_server_sized_get(&req->rq_pill,
837 					   &RMF_MDT_MD, lmmsize);
838 	LASSERT(lmm != NULL);
839 
840 	/*
841 	 * This is coming from the MDS, so is probably in
842 	 * little endian.  We convert it to host endian before
843 	 * passing it to userspace.
844 	 */
845 	/* We don't swab objects for directories */
846 	switch (le32_to_cpu(lmm->lmm_magic)) {
847 	case LOV_MAGIC_V1:
848 		if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
849 			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
850 		break;
851 	case LOV_MAGIC_V3:
852 		if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
853 			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
854 		break;
855 	default:
856 		CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic);
857 		rc = -EPROTO;
858 	}
859 out:
860 	*lmmp = lmm;
861 	*lmm_size = lmmsize;
862 	*request = req;
863 	return rc;
864 }
865 
866 /*
867  *  Get MDT index for the inode.
868  */
ll_get_mdt_idx(struct inode * inode)869 int ll_get_mdt_idx(struct inode *inode)
870 {
871 	struct ll_sb_info *sbi = ll_i2sbi(inode);
872 	struct md_op_data *op_data;
873 	int rc, mdtidx;
874 
875 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0,
876 				     0, LUSTRE_OPC_ANY, NULL);
877 	if (IS_ERR(op_data))
878 		return PTR_ERR(op_data);
879 
880 	op_data->op_flags |= MF_GET_MDT_IDX;
881 	rc = md_getattr(sbi->ll_md_exp, op_data, NULL);
882 	mdtidx = op_data->op_mds;
883 	ll_finish_md_op_data(op_data);
884 	if (rc < 0) {
885 		CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
886 		return rc;
887 	}
888 	return mdtidx;
889 }
890 
891 /**
892  * Generic handler to do any pre-copy work.
893  *
894  * It send a first hsm_progress (with extent length == 0) to coordinator as a
895  * first information for it that real work has started.
896  *
897  * Moreover, for a ARCHIVE request, it will sample the file data version and
898  * store it in \a copy.
899  *
900  * \return 0 on success.
901  */
ll_ioc_copy_start(struct super_block * sb,struct hsm_copy * copy)902 static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy)
903 {
904 	struct ll_sb_info		*sbi = ll_s2sbi(sb);
905 	struct hsm_progress_kernel	 hpk;
906 	int				 rc;
907 
908 	/* Forge a hsm_progress based on data from copy. */
909 	hpk.hpk_fid = copy->hc_hai.hai_fid;
910 	hpk.hpk_cookie = copy->hc_hai.hai_cookie;
911 	hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset;
912 	hpk.hpk_extent.length = 0;
913 	hpk.hpk_flags = 0;
914 	hpk.hpk_errval = 0;
915 	hpk.hpk_data_version = 0;
916 
917 
918 	/* For archive request, we need to read the current file version. */
919 	if (copy->hc_hai.hai_action == HSMA_ARCHIVE) {
920 		struct inode	*inode;
921 		__u64		 data_version = 0;
922 
923 		/* Get inode for this fid */
924 		inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
925 		if (IS_ERR(inode)) {
926 			hpk.hpk_flags |= HP_FLAG_RETRY;
927 			/* hpk_errval is >= 0 */
928 			hpk.hpk_errval = -PTR_ERR(inode);
929 			rc = PTR_ERR(inode);
930 			goto progress;
931 		}
932 
933 		/* Read current file data version */
934 		rc = ll_data_version(inode, &data_version, 1);
935 		iput(inode);
936 		if (rc != 0) {
937 			CDEBUG(D_HSM, "Could not read file data version of "
938 				      DFID" (rc = %d). Archive request (%#llx) could not be done.\n",
939 				      PFID(&copy->hc_hai.hai_fid), rc,
940 				      copy->hc_hai.hai_cookie);
941 			hpk.hpk_flags |= HP_FLAG_RETRY;
942 			/* hpk_errval must be >= 0 */
943 			hpk.hpk_errval = -rc;
944 			goto progress;
945 		}
946 
947 		/* Store it the hsm_copy for later copytool use.
948 		 * Always modified even if no lsm. */
949 		copy->hc_data_version = data_version;
950 	}
951 
952 progress:
953 	rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
954 			   &hpk, NULL);
955 
956 	return rc;
957 }
958 
959 /**
960  * Generic handler to do any post-copy work.
961  *
962  * It will send the last hsm_progress update to coordinator to inform it
963  * that copy is finished and whether it was successful or not.
964  *
965  * Moreover,
966  * - for ARCHIVE request, it will sample the file data version and compare it
967  *   with the version saved in ll_ioc_copy_start(). If they do not match, copy
968  *   will be considered as failed.
969  * - for RESTORE request, it will sample the file data version and send it to
970  *   coordinator which is useful if the file was imported as 'released'.
971  *
972  * \return 0 on success.
973  */
ll_ioc_copy_end(struct super_block * sb,struct hsm_copy * copy)974 static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy)
975 {
976 	struct ll_sb_info		*sbi = ll_s2sbi(sb);
977 	struct hsm_progress_kernel	 hpk;
978 	int				 rc;
979 
980 	/* If you modify the logic here, also check llapi_hsm_copy_end(). */
981 	/* Take care: copy->hc_hai.hai_action, len, gid and data are not
982 	 * initialized if copy_end was called with copy == NULL.
983 	 */
984 
985 	/* Forge a hsm_progress based on data from copy. */
986 	hpk.hpk_fid = copy->hc_hai.hai_fid;
987 	hpk.hpk_cookie = copy->hc_hai.hai_cookie;
988 	hpk.hpk_extent = copy->hc_hai.hai_extent;
989 	hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED;
990 	hpk.hpk_errval = copy->hc_errval;
991 	hpk.hpk_data_version = 0;
992 
993 	/* For archive request, we need to check the file data was not changed.
994 	 *
995 	 * For restore request, we need to send the file data version, this is
996 	 * useful when the file was created using hsm_import.
997 	 */
998 	if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) ||
999 	     (copy->hc_hai.hai_action == HSMA_RESTORE)) &&
1000 	    (copy->hc_errval == 0)) {
1001 		struct inode	*inode;
1002 		__u64		 data_version = 0;
1003 
1004 		/* Get lsm for this fid */
1005 		inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
1006 		if (IS_ERR(inode)) {
1007 			hpk.hpk_flags |= HP_FLAG_RETRY;
1008 			/* hpk_errval must be >= 0 */
1009 			hpk.hpk_errval = -PTR_ERR(inode);
1010 			rc = PTR_ERR(inode);
1011 			goto progress;
1012 		}
1013 
1014 		rc = ll_data_version(inode, &data_version,
1015 				     copy->hc_hai.hai_action == HSMA_ARCHIVE);
1016 		iput(inode);
1017 		if (rc) {
1018 			CDEBUG(D_HSM, "Could not read file data version. Request could not be confirmed.\n");
1019 			if (hpk.hpk_errval == 0)
1020 				hpk.hpk_errval = -rc;
1021 			goto progress;
1022 		}
1023 
1024 		/* Store it the hsm_copy for later copytool use.
1025 		 * Always modified even if no lsm. */
1026 		hpk.hpk_data_version = data_version;
1027 
1028 		/* File could have been stripped during archiving, so we need
1029 		 * to check anyway. */
1030 		if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) &&
1031 		    (copy->hc_data_version != data_version)) {
1032 			CDEBUG(D_HSM, "File data version mismatched. File content was changed during archiving. "
1033 			       DFID", start:%#llx current:%#llx\n",
1034 			       PFID(&copy->hc_hai.hai_fid),
1035 			       copy->hc_data_version, data_version);
1036 			/* File was changed, send error to cdt. Do not ask for
1037 			 * retry because if a file is modified frequently,
1038 			 * the cdt will loop on retried archive requests.
1039 			 * The policy engine will ask for a new archive later
1040 			 * when the file will not be modified for some tunable
1041 			 * time */
1042 			/* we do not notify caller */
1043 			hpk.hpk_flags &= ~HP_FLAG_RETRY;
1044 			/* hpk_errval must be >= 0 */
1045 			hpk.hpk_errval = EBUSY;
1046 		}
1047 
1048 	}
1049 
1050 progress:
1051 	rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
1052 			   &hpk, NULL);
1053 
1054 	return rc;
1055 }
1056 
1057 
/**
 * Copy an ioctl argument of known size from user space and hand the kernel
 * copy to obd_iocontrol() on the given export.
 *
 * \param[in] cmd	ioctl command forwarded to obd_iocontrol()
 * \param[in] exp	export the command is sent to
 * \param[in] data	user-space address of the argument
 * \param[in] size	number of bytes to copy from \a data
 *
 * \retval -ENOMEM if the kernel buffer cannot be allocated
 * \retval -EFAULT if the user buffer cannot be read
 * \retval the value returned by obd_iocontrol() otherwise
 */
static int copy_and_ioctl(int cmd, struct obd_export *exp,
			  const void __user *data, size_t size)
{
	void *copy;
	int rc;

	copy = kzalloc(size, GFP_NOFS);
	if (!copy)
		return -ENOMEM;

	if (copy_from_user(copy, data, size))
		rc = -EFAULT;
	else
		rc = obd_iocontrol(cmd, exp, size, copy, NULL);

	/* The buffer came from kzalloc(), so it is released with kfree(). */
	kfree(copy);

	return rc;
}
1079 
quotactl_ioctl(struct ll_sb_info * sbi,struct if_quotactl * qctl)1080 static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl)
1081 {
1082 	int cmd = qctl->qc_cmd;
1083 	int type = qctl->qc_type;
1084 	int id = qctl->qc_id;
1085 	int valid = qctl->qc_valid;
1086 	int rc = 0;
1087 
1088 	switch (cmd) {
1089 	case LUSTRE_Q_INVALIDATE:
1090 	case LUSTRE_Q_FINVALIDATE:
1091 	case Q_QUOTAON:
1092 	case Q_QUOTAOFF:
1093 	case Q_SETQUOTA:
1094 	case Q_SETINFO:
1095 		if (!capable(CFS_CAP_SYS_ADMIN) ||
1096 		    sbi->ll_flags & LL_SBI_RMT_CLIENT)
1097 			return -EPERM;
1098 		break;
1099 	case Q_GETQUOTA:
1100 		if (((type == USRQUOTA &&
1101 		      !uid_eq(current_euid(), make_kuid(&init_user_ns, id))) ||
1102 		     (type == GRPQUOTA &&
1103 		      !in_egroup_p(make_kgid(&init_user_ns, id)))) &&
1104 		    (!capable(CFS_CAP_SYS_ADMIN) ||
1105 		     sbi->ll_flags & LL_SBI_RMT_CLIENT))
1106 			return -EPERM;
1107 		break;
1108 	case Q_GETINFO:
1109 		break;
1110 	default:
1111 		CERROR("unsupported quotactl op: %#x\n", cmd);
1112 		return -ENOTTY;
1113 	}
1114 
1115 	if (valid != QC_GENERAL) {
1116 		if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
1117 			return -EOPNOTSUPP;
1118 
1119 		if (cmd == Q_GETINFO)
1120 			qctl->qc_cmd = Q_GETOINFO;
1121 		else if (cmd == Q_GETQUOTA)
1122 			qctl->qc_cmd = Q_GETOQUOTA;
1123 		else
1124 			return -EINVAL;
1125 
1126 		switch (valid) {
1127 		case QC_MDTIDX:
1128 			rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
1129 					   sizeof(*qctl), qctl, NULL);
1130 			break;
1131 		case QC_OSTIDX:
1132 			rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp,
1133 					   sizeof(*qctl), qctl, NULL);
1134 			break;
1135 		case QC_UUID:
1136 			rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
1137 					   sizeof(*qctl), qctl, NULL);
1138 			if (rc == -EAGAIN)
1139 				rc = obd_iocontrol(OBD_IOC_QUOTACTL,
1140 						   sbi->ll_dt_exp,
1141 						   sizeof(*qctl), qctl, NULL);
1142 			break;
1143 		default:
1144 			rc = -EINVAL;
1145 			break;
1146 		}
1147 
1148 		if (rc)
1149 			return rc;
1150 
1151 		qctl->qc_cmd = cmd;
1152 	} else {
1153 		struct obd_quotactl *oqctl;
1154 
1155 		oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS);
1156 		if (!oqctl)
1157 			return -ENOMEM;
1158 
1159 		QCTL_COPY(oqctl, qctl);
1160 		rc = obd_quotactl(sbi->ll_md_exp, oqctl);
1161 		if (rc) {
1162 			if (rc != -EALREADY && cmd == Q_QUOTAON) {
1163 				oqctl->qc_cmd = Q_QUOTAOFF;
1164 				obd_quotactl(sbi->ll_md_exp, oqctl);
1165 			}
1166 			OBD_FREE_PTR(oqctl);
1167 			return rc;
1168 		}
1169 		/* If QIF_SPACE is not set, client should collect the
1170 		 * space usage from OSSs by itself */
1171 		if (cmd == Q_GETQUOTA &&
1172 		    !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) &&
1173 		    !oqctl->qc_dqblk.dqb_curspace) {
1174 			struct obd_quotactl *oqctl_tmp;
1175 
1176 			oqctl_tmp = kzalloc(sizeof(*oqctl_tmp), GFP_NOFS);
1177 			if (!oqctl_tmp) {
1178 				rc = -ENOMEM;
1179 				goto out;
1180 			}
1181 
1182 			oqctl_tmp->qc_cmd = Q_GETOQUOTA;
1183 			oqctl_tmp->qc_id = oqctl->qc_id;
1184 			oqctl_tmp->qc_type = oqctl->qc_type;
1185 
1186 			/* collect space usage from OSTs */
1187 			oqctl_tmp->qc_dqblk.dqb_curspace = 0;
1188 			rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp);
1189 			if (!rc || rc == -EREMOTEIO) {
1190 				oqctl->qc_dqblk.dqb_curspace =
1191 					oqctl_tmp->qc_dqblk.dqb_curspace;
1192 				oqctl->qc_dqblk.dqb_valid |= QIF_SPACE;
1193 			}
1194 
1195 			/* collect space & inode usage from MDTs */
1196 			oqctl_tmp->qc_dqblk.dqb_curspace = 0;
1197 			oqctl_tmp->qc_dqblk.dqb_curinodes = 0;
1198 			rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp);
1199 			if (!rc || rc == -EREMOTEIO) {
1200 				oqctl->qc_dqblk.dqb_curspace +=
1201 					oqctl_tmp->qc_dqblk.dqb_curspace;
1202 				oqctl->qc_dqblk.dqb_curinodes =
1203 					oqctl_tmp->qc_dqblk.dqb_curinodes;
1204 				oqctl->qc_dqblk.dqb_valid |= QIF_INODES;
1205 			} else {
1206 				oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE;
1207 			}
1208 
1209 			OBD_FREE_PTR(oqctl_tmp);
1210 		}
1211 out:
1212 		QCTL_COPY(qctl, oqctl);
1213 		OBD_FREE_PTR(oqctl);
1214 	}
1215 
1216 	return rc;
1217 }
1218 
1219 static char *
ll_getname(const char __user * filename)1220 ll_getname(const char __user *filename)
1221 {
1222 	int ret = 0, len;
1223 	char *tmp = __getname();
1224 
1225 	if (!tmp)
1226 		return ERR_PTR(-ENOMEM);
1227 
1228 	len = strncpy_from_user(tmp, filename, PATH_MAX);
1229 	if (len == 0)
1230 		ret = -ENOENT;
1231 	else if (len > PATH_MAX)
1232 		ret = -ENAMETOOLONG;
1233 
1234 	if (ret) {
1235 		__putname(tmp);
1236 		tmp =  ERR_PTR(ret);
1237 	}
1238 	return tmp;
1239 }
1240 
1241 #define ll_putname(filename) __putname(filename)
1242 
ll_dir_ioctl(struct file * file,unsigned int cmd,unsigned long arg)1243 static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1244 {
1245 	struct inode *inode = file_inode(file);
1246 	struct ll_sb_info *sbi = ll_i2sbi(inode);
1247 	struct obd_ioctl_data *data;
1248 	int rc = 0;
1249 
1250 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
1251 	       inode->i_ino, inode->i_generation, inode, cmd);
1252 
1253 	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1254 	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1255 		return -ENOTTY;
1256 
1257 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1258 	switch (cmd) {
1259 	case FSFILT_IOC_GETFLAGS:
1260 	case FSFILT_IOC_SETFLAGS:
1261 		return ll_iocontrol(inode, file, cmd, arg);
1262 	case FSFILT_IOC_GETVERSION_OLD:
1263 	case FSFILT_IOC_GETVERSION:
1264 		return put_user(inode->i_generation, (int *)arg);
1265 	/* We need to special case any other ioctls we want to handle,
1266 	 * to send them to the MDS/OST as appropriate and to properly
1267 	 * network encode the arg field.
1268 	case FSFILT_IOC_SETVERSION_OLD:
1269 	case FSFILT_IOC_SETVERSION:
1270 	*/
1271 	case LL_IOC_GET_MDTIDX: {
1272 		int mdtidx;
1273 
1274 		mdtidx = ll_get_mdt_idx(inode);
1275 		if (mdtidx < 0)
1276 			return mdtidx;
1277 
1278 		if (put_user((int)mdtidx, (int *)arg))
1279 			return -EFAULT;
1280 
1281 		return 0;
1282 	}
1283 	case IOC_MDC_LOOKUP: {
1284 		struct ptlrpc_request *request = NULL;
1285 		int namelen, len = 0;
1286 		char *buf = NULL;
1287 		char *filename;
1288 		struct md_op_data *op_data;
1289 
1290 		rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1291 		if (rc)
1292 			return rc;
1293 		data = (void *)buf;
1294 
1295 		filename = data->ioc_inlbuf1;
1296 		namelen = strlen(filename);
1297 
1298 		if (namelen < 1) {
1299 			CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
1300 			rc = -EINVAL;
1301 			goto out_free;
1302 		}
1303 
1304 		op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, namelen,
1305 					     0, LUSTRE_OPC_ANY, NULL);
1306 		if (IS_ERR(op_data)) {
1307 			rc = PTR_ERR(op_data);
1308 			goto out_free;
1309 		}
1310 
1311 		op_data->op_valid = OBD_MD_FLID;
1312 		rc = md_getattr_name(sbi->ll_md_exp, op_data, &request);
1313 		ll_finish_md_op_data(op_data);
1314 		if (rc < 0) {
1315 			CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
1316 			goto out_free;
1317 		}
1318 		ptlrpc_req_finished(request);
1319 out_free:
1320 		obd_ioctl_freedata(buf, len);
1321 		return rc;
1322 	}
1323 	case LL_IOC_LMV_SETSTRIPE: {
1324 		struct lmv_user_md  *lum;
1325 		char		*buf = NULL;
1326 		char		*filename;
1327 		int		 namelen = 0;
1328 		int		 lumlen = 0;
1329 		int		 len;
1330 		int		 rc;
1331 
1332 		rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1333 		if (rc)
1334 			return rc;
1335 
1336 		data = (void *)buf;
1337 		if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL ||
1338 		    data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) {
1339 			rc = -EINVAL;
1340 			goto lmv_out_free;
1341 		}
1342 
1343 		filename = data->ioc_inlbuf1;
1344 		namelen = data->ioc_inllen1;
1345 
1346 		if (namelen < 1) {
1347 			CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
1348 			rc = -EINVAL;
1349 			goto lmv_out_free;
1350 		}
1351 		lum = (struct lmv_user_md *)data->ioc_inlbuf2;
1352 		lumlen = data->ioc_inllen2;
1353 
1354 		if (lum->lum_magic != LMV_USER_MAGIC ||
1355 		    lumlen != sizeof(*lum)) {
1356 			CERROR("%s: wrong lum magic %x or size %d: rc = %d\n",
1357 			       filename, lum->lum_magic, lumlen, -EFAULT);
1358 			rc = -EINVAL;
1359 			goto lmv_out_free;
1360 		}
1361 
1362 		/**
1363 		 * ll_dir_setdirstripe will be used to set dir stripe
1364 		 *  mdc_create--->mdt_reint_create (with dirstripe)
1365 		 */
1366 		rc = ll_dir_setdirstripe(inode, lum, filename);
1367 lmv_out_free:
1368 		obd_ioctl_freedata(buf, len);
1369 		return rc;
1370 
1371 	}
1372 	case LL_IOC_LOV_SETSTRIPE: {
1373 		struct lov_user_md_v3 lumv3;
1374 		struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1375 		struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1376 		struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1377 
1378 		int set_default = 0;
1379 
1380 		LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
1381 		LASSERT(sizeof(lumv3.lmm_objects[0]) ==
1382 			sizeof(lumv3p->lmm_objects[0]));
1383 		/* first try with v1 which is smaller than v3 */
1384 		if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1)))
1385 			return -EFAULT;
1386 
1387 		if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1388 			if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3)))
1389 				return -EFAULT;
1390 		}
1391 
1392 		if (is_root_inode(inode))
1393 			set_default = 1;
1394 
1395 		/* in v1 and v3 cases lumv1 points to data */
1396 		rc = ll_dir_setstripe(inode, lumv1, set_default);
1397 
1398 		return rc;
1399 	}
1400 	case LL_IOC_LMV_GETSTRIPE: {
1401 		struct lmv_user_md *lump = (struct lmv_user_md *)arg;
1402 		struct lmv_user_md lum;
1403 		struct lmv_user_md *tmp;
1404 		int lum_size;
1405 		int rc = 0;
1406 		int mdtindex;
1407 
1408 		if (copy_from_user(&lum, lump, sizeof(struct lmv_user_md)))
1409 			return -EFAULT;
1410 
1411 		if (lum.lum_magic != LMV_MAGIC_V1)
1412 			return -EINVAL;
1413 
1414 		lum_size = lmv_user_md_size(1, LMV_MAGIC_V1);
1415 		tmp = kzalloc(lum_size, GFP_NOFS);
1416 		if (!tmp) {
1417 			rc = -ENOMEM;
1418 			goto free_lmv;
1419 		}
1420 
1421 		*tmp = lum;
1422 		tmp->lum_type = LMV_STRIPE_TYPE;
1423 		tmp->lum_stripe_count = 1;
1424 		mdtindex = ll_get_mdt_idx(inode);
1425 		if (mdtindex < 0) {
1426 			rc = -ENOMEM;
1427 			goto free_lmv;
1428 		}
1429 
1430 		tmp->lum_stripe_offset = mdtindex;
1431 		tmp->lum_objects[0].lum_mds = mdtindex;
1432 		memcpy(&tmp->lum_objects[0].lum_fid, ll_inode2fid(inode),
1433 		       sizeof(struct lu_fid));
1434 		if (copy_to_user((void *)arg, tmp, lum_size)) {
1435 			rc = -EFAULT;
1436 			goto free_lmv;
1437 		}
1438 free_lmv:
1439 		if (tmp)
1440 			OBD_FREE(tmp, lum_size);
1441 		return rc;
1442 	}
1443 	case LL_IOC_REMOVE_ENTRY: {
1444 		char		*filename = NULL;
1445 		int		 namelen = 0;
1446 		int		 rc;
1447 
1448 		/* Here is a little hack to avoid sending REINT_RMENTRY to
1449 		 * unsupported server, which might crash the server(LU-2730),
1450 		 * Because both LVB_TYPE and REINT_RMENTRY will be supported
1451 		 * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the
1452 		 * server will support REINT_RMENTRY XXX*/
1453 		if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE))
1454 			return -ENOTSUPP;
1455 
1456 		filename = ll_getname((const char *)arg);
1457 		if (IS_ERR(filename))
1458 			return PTR_ERR(filename);
1459 
1460 		namelen = strlen(filename);
1461 		if (namelen < 1) {
1462 			rc = -EINVAL;
1463 			goto out_rmdir;
1464 		}
1465 
1466 		rc = ll_rmdir_entry(inode, filename, namelen);
1467 out_rmdir:
1468 		if (filename)
1469 			ll_putname(filename);
1470 		return rc;
1471 	}
1472 	case LL_IOC_LOV_SWAP_LAYOUTS:
1473 		return -EPERM;
1474 	case LL_IOC_OBD_STATFS:
1475 		return ll_obd_statfs(inode, (void *)arg);
1476 	case LL_IOC_LOV_GETSTRIPE:
1477 	case LL_IOC_MDC_GETINFO:
1478 	case IOC_MDC_GETFILEINFO:
1479 	case IOC_MDC_GETFILESTRIPE: {
1480 		struct ptlrpc_request *request = NULL;
1481 		struct lov_user_md *lump;
1482 		struct lov_mds_md *lmm = NULL;
1483 		struct mdt_body *body;
1484 		char *filename = NULL;
1485 		int lmmsize;
1486 
1487 		if (cmd == IOC_MDC_GETFILEINFO ||
1488 		    cmd == IOC_MDC_GETFILESTRIPE) {
1489 			filename = ll_getname((const char *)arg);
1490 			if (IS_ERR(filename))
1491 				return PTR_ERR(filename);
1492 
1493 			rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
1494 						      &lmmsize, &request);
1495 		} else {
1496 			rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
1497 		}
1498 
1499 		if (request) {
1500 			body = req_capsule_server_get(&request->rq_pill,
1501 						      &RMF_MDT_BODY);
1502 			LASSERT(body != NULL);
1503 		} else {
1504 			goto out_req;
1505 		}
1506 
1507 		if (rc < 0) {
1508 			if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
1509 					       cmd == LL_IOC_MDC_GETINFO)) {
1510 				rc = 0;
1511 				goto skip_lmm;
1512 			} else
1513 				goto out_req;
1514 		}
1515 
1516 		if (cmd == IOC_MDC_GETFILESTRIPE ||
1517 		    cmd == LL_IOC_LOV_GETSTRIPE) {
1518 			lump = (struct lov_user_md *)arg;
1519 		} else {
1520 			struct lov_user_mds_data *lmdp;
1521 
1522 			lmdp = (struct lov_user_mds_data *)arg;
1523 			lump = &lmdp->lmd_lmm;
1524 		}
1525 		if (copy_to_user(lump, lmm, lmmsize)) {
1526 			if (copy_to_user(lump, lmm, sizeof(*lump))) {
1527 				rc = -EFAULT;
1528 				goto out_req;
1529 			}
1530 			rc = -EOVERFLOW;
1531 		}
1532 skip_lmm:
1533 		if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
1534 			struct lov_user_mds_data *lmdp;
1535 			lstat_t st = { 0 };
1536 
1537 			st.st_dev     = inode->i_sb->s_dev;
1538 			st.st_mode    = body->mode;
1539 			st.st_nlink   = body->nlink;
1540 			st.st_uid     = body->uid;
1541 			st.st_gid     = body->gid;
1542 			st.st_rdev    = body->rdev;
1543 			st.st_size    = body->size;
1544 			st.st_blksize = PAGE_CACHE_SIZE;
1545 			st.st_blocks  = body->blocks;
1546 			st.st_atime   = body->atime;
1547 			st.st_mtime   = body->mtime;
1548 			st.st_ctime   = body->ctime;
1549 			st.st_ino     = inode->i_ino;
1550 
1551 			lmdp = (struct lov_user_mds_data *)arg;
1552 			if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) {
1553 				rc = -EFAULT;
1554 				goto out_req;
1555 			}
1556 		}
1557 
1558 out_req:
1559 		ptlrpc_req_finished(request);
1560 		if (filename)
1561 			ll_putname(filename);
1562 		return rc;
1563 	}
1564 	case IOC_LOV_GETINFO: {
1565 		struct lov_user_mds_data *lumd;
1566 		struct lov_stripe_md *lsm;
1567 		struct lov_user_md *lum;
1568 		struct lov_mds_md *lmm;
1569 		int lmmsize;
1570 		lstat_t st;
1571 
1572 		lumd = (struct lov_user_mds_data *)arg;
1573 		lum = &lumd->lmd_lmm;
1574 
1575 		rc = ll_get_max_mdsize(sbi, &lmmsize);
1576 		if (rc)
1577 			return rc;
1578 
1579 		OBD_ALLOC_LARGE(lmm, lmmsize);
1580 		if (lmm == NULL)
1581 			return -ENOMEM;
1582 		if (copy_from_user(lmm, lum, lmmsize)) {
1583 			rc = -EFAULT;
1584 			goto free_lmm;
1585 		}
1586 
1587 		switch (lmm->lmm_magic) {
1588 		case LOV_USER_MAGIC_V1:
1589 			if (LOV_USER_MAGIC_V1 == cpu_to_le32(LOV_USER_MAGIC_V1))
1590 				break;
1591 			/* swab objects first so that stripes num will be sane */
1592 			lustre_swab_lov_user_md_objects(
1593 				((struct lov_user_md_v1 *)lmm)->lmm_objects,
1594 				((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1595 			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1596 			break;
1597 		case LOV_USER_MAGIC_V3:
1598 			if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3))
1599 				break;
1600 			/* swab objects first so that stripes num will be sane */
1601 			lustre_swab_lov_user_md_objects(
1602 				((struct lov_user_md_v3 *)lmm)->lmm_objects,
1603 				((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1604 			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1605 			break;
1606 		default:
1607 			rc = -EINVAL;
1608 			goto free_lmm;
1609 		}
1610 
1611 		rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1612 		if (rc < 0) {
1613 			rc = -ENOMEM;
1614 			goto free_lmm;
1615 		}
1616 
1617 		/* Perform glimpse_size operation. */
1618 		memset(&st, 0, sizeof(st));
1619 
1620 		rc = ll_glimpse_ioctl(sbi, lsm, &st);
1621 		if (rc)
1622 			goto free_lsm;
1623 
1624 		if (copy_to_user(&lumd->lmd_st, &st, sizeof(st))) {
1625 			rc = -EFAULT;
1626 			goto free_lsm;
1627 		}
1628 
1629 free_lsm:
1630 		obd_free_memmd(sbi->ll_dt_exp, &lsm);
1631 free_lmm:
1632 		OBD_FREE_LARGE(lmm, lmmsize);
1633 		return rc;
1634 	}
1635 	case OBD_IOC_LLOG_CATINFO: {
1636 		return -EOPNOTSUPP;
1637 	}
1638 	case OBD_IOC_QUOTACHECK: {
1639 		struct obd_quotactl *oqctl;
1640 		int error = 0;
1641 
1642 		if (!capable(CFS_CAP_SYS_ADMIN) ||
1643 		    sbi->ll_flags & LL_SBI_RMT_CLIENT)
1644 			return -EPERM;
1645 
1646 		oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS);
1647 		if (!oqctl)
1648 			return -ENOMEM;
1649 		oqctl->qc_type = arg;
1650 		rc = obd_quotacheck(sbi->ll_md_exp, oqctl);
1651 		if (rc < 0) {
1652 			CDEBUG(D_INFO, "md_quotacheck failed: rc %d\n", rc);
1653 			error = rc;
1654 		}
1655 
1656 		rc = obd_quotacheck(sbi->ll_dt_exp, oqctl);
1657 		if (rc < 0)
1658 			CDEBUG(D_INFO, "obd_quotacheck failed: rc %d\n", rc);
1659 
1660 		OBD_FREE_PTR(oqctl);
1661 		return error ?: rc;
1662 	}
1663 	case OBD_IOC_POLL_QUOTACHECK: {
1664 		struct if_quotacheck *check;
1665 
1666 		if (!capable(CFS_CAP_SYS_ADMIN) ||
1667 		    sbi->ll_flags & LL_SBI_RMT_CLIENT)
1668 			return -EPERM;
1669 
1670 		check = kzalloc(sizeof(*check), GFP_NOFS);
1671 		if (!check)
1672 			return -ENOMEM;
1673 
1674 		rc = obd_iocontrol(cmd, sbi->ll_md_exp, 0, (void *)check,
1675 				   NULL);
1676 		if (rc) {
1677 			CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
1678 			if (copy_to_user((void *)arg, check,
1679 					     sizeof(*check)))
1680 				CDEBUG(D_QUOTA, "copy_to_user failed\n");
1681 			goto out_poll;
1682 		}
1683 
1684 		rc = obd_iocontrol(cmd, sbi->ll_dt_exp, 0, (void *)check,
1685 				   NULL);
1686 		if (rc) {
1687 			CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
1688 			if (copy_to_user((void *)arg, check,
1689 					     sizeof(*check)))
1690 				CDEBUG(D_QUOTA, "copy_to_user failed\n");
1691 			goto out_poll;
1692 		}
1693 out_poll:
1694 		OBD_FREE_PTR(check);
1695 		return rc;
1696 	}
1697 	case LL_IOC_QUOTACTL: {
1698 		struct if_quotactl *qctl;
1699 
1700 		qctl = kzalloc(sizeof(*qctl), GFP_NOFS);
1701 		if (!qctl)
1702 			return -ENOMEM;
1703 
1704 		if (copy_from_user(qctl, (void *)arg, sizeof(*qctl))) {
1705 			rc = -EFAULT;
1706 			goto out_quotactl;
1707 		}
1708 
1709 		rc = quotactl_ioctl(sbi, qctl);
1710 
1711 		if (rc == 0 && copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1712 			rc = -EFAULT;
1713 
1714 out_quotactl:
1715 		OBD_FREE_PTR(qctl);
1716 		return rc;
1717 	}
1718 	case OBD_IOC_GETDTNAME:
1719 	case OBD_IOC_GETMDNAME:
1720 		return ll_get_obd_name(inode, cmd, arg);
1721 	case LL_IOC_FLUSHCTX:
1722 		return ll_flush_ctx(inode);
1723 #ifdef CONFIG_FS_POSIX_ACL
1724 	case LL_IOC_RMTACL: {
1725 	    if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
1726 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1727 
1728 		LASSERT(fd != NULL);
1729 		rc = rct_add(&sbi->ll_rct, current_pid(), arg);
1730 		if (!rc)
1731 			fd->fd_flags |= LL_FILE_RMTACL;
1732 		return rc;
1733 	    } else
1734 		return 0;
1735 	}
1736 #endif
1737 	case LL_IOC_GETOBDCOUNT: {
1738 		int count, vallen;
1739 		struct obd_export *exp;
1740 
1741 		if (copy_from_user(&count, (int *)arg, sizeof(int)))
1742 			return -EFAULT;
1743 
1744 		/* get ost count when count is zero, get mdt count otherwise */
1745 		exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp;
1746 		vallen = sizeof(count);
1747 		rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT),
1748 				  KEY_TGT_COUNT, &vallen, &count, NULL);
1749 		if (rc) {
1750 			CERROR("get target count failed: %d\n", rc);
1751 			return rc;
1752 		}
1753 
1754 		if (copy_to_user((int *)arg, &count, sizeof(int)))
1755 			return -EFAULT;
1756 
1757 		return 0;
1758 	}
1759 	case LL_IOC_PATH2FID:
1760 		if (copy_to_user((void *)arg, ll_inode2fid(inode),
1761 				     sizeof(struct lu_fid)))
1762 			return -EFAULT;
1763 		return 0;
1764 	case LL_IOC_GET_CONNECT_FLAGS: {
1765 		return obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, (void *)arg);
1766 	}
1767 	case OBD_IOC_CHANGELOG_SEND:
1768 	case OBD_IOC_CHANGELOG_CLEAR:
1769 		rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
1770 				    sizeof(struct ioc_changelog));
1771 		return rc;
1772 	case OBD_IOC_FID2PATH:
1773 		return ll_fid2path(inode, (void *)arg);
1774 	case LL_IOC_HSM_REQUEST: {
1775 		struct hsm_user_request	*hur;
1776 		ssize_t			 totalsize;
1777 
1778 		hur = kzalloc(sizeof(*hur), GFP_NOFS);
1779 		if (!hur)
1780 			return -ENOMEM;
1781 
1782 		/* We don't know the true size yet; copy the fixed-size part */
1783 		if (copy_from_user(hur, (void *)arg, sizeof(*hur))) {
1784 			OBD_FREE_PTR(hur);
1785 			return -EFAULT;
1786 		}
1787 
1788 		/* Compute the whole struct size */
1789 		totalsize = hur_len(hur);
1790 		OBD_FREE_PTR(hur);
1791 		if (totalsize < 0)
1792 			return -E2BIG;
1793 
1794 		/* Final size will be more than double totalsize */
1795 		if (totalsize >= MDS_MAXREQSIZE / 3)
1796 			return -E2BIG;
1797 
1798 		OBD_ALLOC_LARGE(hur, totalsize);
1799 		if (hur == NULL)
1800 			return -ENOMEM;
1801 
1802 		/* Copy the whole struct */
1803 		if (copy_from_user(hur, (void *)arg, totalsize)) {
1804 			OBD_FREE_LARGE(hur, totalsize);
1805 			return -EFAULT;
1806 		}
1807 
1808 		if (hur->hur_request.hr_action == HUA_RELEASE) {
1809 			const struct lu_fid *fid;
1810 			struct inode *f;
1811 			int i;
1812 
1813 			for (i = 0; i < hur->hur_request.hr_itemcount; i++) {
1814 				fid = &hur->hur_user_item[i].hui_fid;
1815 				f = search_inode_for_lustre(inode->i_sb, fid);
1816 				if (IS_ERR(f)) {
1817 					rc = PTR_ERR(f);
1818 					break;
1819 				}
1820 
1821 				rc = ll_hsm_release(f);
1822 				iput(f);
1823 				if (rc != 0)
1824 					break;
1825 			}
1826 		} else {
1827 			rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize,
1828 					   hur, NULL);
1829 		}
1830 
1831 		OBD_FREE_LARGE(hur, totalsize);
1832 
1833 		return rc;
1834 	}
1835 	case LL_IOC_HSM_PROGRESS: {
1836 		struct hsm_progress_kernel	hpk;
1837 		struct hsm_progress		hp;
1838 
1839 		if (copy_from_user(&hp, (void *)arg, sizeof(hp)))
1840 			return -EFAULT;
1841 
1842 		hpk.hpk_fid = hp.hp_fid;
1843 		hpk.hpk_cookie = hp.hp_cookie;
1844 		hpk.hpk_extent = hp.hp_extent;
1845 		hpk.hpk_flags = hp.hp_flags;
1846 		hpk.hpk_errval = hp.hp_errval;
1847 		hpk.hpk_data_version = 0;
1848 
1849 		/* File may not exist in Lustre; all progress
1850 		 * reported to Lustre root */
1851 		rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk,
1852 				   NULL);
1853 		return rc;
1854 	}
1855 	case LL_IOC_HSM_CT_START:
1856 		rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
1857 				    sizeof(struct lustre_kernelcomm));
1858 		return rc;
1859 
1860 	case LL_IOC_HSM_COPY_START: {
1861 		struct hsm_copy	*copy;
1862 		int		 rc;
1863 
1864 		copy = kzalloc(sizeof(*copy), GFP_NOFS);
1865 		if (!copy)
1866 			return -ENOMEM;
1867 		if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
1868 			OBD_FREE_PTR(copy);
1869 			return -EFAULT;
1870 		}
1871 
1872 		rc = ll_ioc_copy_start(inode->i_sb, copy);
1873 		if (copy_to_user((char *)arg, copy, sizeof(*copy)))
1874 			rc = -EFAULT;
1875 
1876 		OBD_FREE_PTR(copy);
1877 		return rc;
1878 	}
1879 	case LL_IOC_HSM_COPY_END: {
1880 		struct hsm_copy	*copy;
1881 		int		 rc;
1882 
1883 		copy = kzalloc(sizeof(*copy), GFP_NOFS);
1884 		if (!copy)
1885 			return -ENOMEM;
1886 		if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
1887 			OBD_FREE_PTR(copy);
1888 			return -EFAULT;
1889 		}
1890 
1891 		rc = ll_ioc_copy_end(inode->i_sb, copy);
1892 		if (copy_to_user((char *)arg, copy, sizeof(*copy)))
1893 			rc = -EFAULT;
1894 
1895 		OBD_FREE_PTR(copy);
1896 		return rc;
1897 	}
1898 	default:
1899 		return obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, (void *)arg);
1900 	}
1901 }
1902 
/**
 * llseek method for directories.
 *
 * The largest position accepted is LL_DIR_END_OFF_32BIT when
 * ll_need_32bit_api() reports true for this mount and LL_DIR_END_OFF
 * otherwise. Seeking to that position stores MDS_DIR_END_OFF in the per-open
 * lfd_pos.
 *
 * \param[in] file	open directory
 * \param[in] offset	requested offset, interpreted according to \a origin
 * \param[in] origin	SEEK_SET, SEEK_CUR or SEEK_END
 *
 * \return the new position, or -EINVAL when \a origin is not supported, a
 *	   SEEK_END offset is positive or the resulting position is out of
 *	   range
 */
static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
{
	struct inode *inode = file->f_mapping->host;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	int api32 = ll_need_32bit_api(sbi);
	loff_t end_off = api32 ? LL_DIR_END_OFF_32BIT : LL_DIR_END_OFF;
	loff_t ret = -EINVAL;

	mutex_lock(&inode->i_mutex);

	if (origin == SEEK_CUR) {
		offset += file->f_pos;
	} else if (origin == SEEK_END) {
		/* only positions at or before the end can be requested */
		if (offset > 0)
			goto unlock;
		offset += end_off;
	} else if (origin != SEEK_SET) {
		goto unlock;
	}

	if (offset < 0 || offset > end_off)
		goto unlock;

	if (offset != file->f_pos) {
		if (offset == end_off)
			fd->lfd_pos = MDS_DIR_END_OFF;
		else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH)
			/* 32-bit API on top of 64-bit hashes: the offset is
			 * moved to the upper half of lfd_pos
			 */
			fd->lfd_pos = offset << 32;
		else
			fd->lfd_pos = offset;
		file->f_pos = offset;
		file->f_version = 0;
	}
	ret = offset;

unlock:
	mutex_unlock(&inode->i_mutex);
	return ret;
}
1952 
/* open method for directories: delegates to ll_file_open(). */
static int ll_dir_open(struct inode *inode, struct file *file)
{
	int rc;

	rc = ll_file_open(inode, file);

	return rc;
}
1957 
/* release method for directories: delegates to ll_file_release(). */
static int ll_dir_release(struct inode *inode, struct file *file)
{
	int rc;

	rc = ll_file_release(inode, file);

	return rc;
}
1962 
1963 const struct file_operations ll_dir_operations = {
1964 	.llseek   = ll_dir_seek,
1965 	.open     = ll_dir_open,
1966 	.release  = ll_dir_release,
1967 	.read     = generic_read_dir,
1968 	.iterate  = ll_readdir,
1969 	.unlocked_ioctl   = ll_dir_ioctl,
1970 	.fsync    = ll_fsync,
1971 };
1972