1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42 
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include "../include/lustre_dlm.h"
45 #include "../include/lustre_lite.h"
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include "../include/lustre/ll_fiemap.h"
50 
51 #include "../include/cl_object.h"
52 
53 static int
54 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
55 
56 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
57 			  bool *lease_broken);
58 
59 static enum llioc_iter
60 ll_iocontrol_call(struct inode *inode, struct file *file,
61 		  unsigned int cmd, unsigned long arg, int *rcp);
62 
ll_file_data_get(void)63 static struct ll_file_data *ll_file_data_get(void)
64 {
65 	struct ll_file_data *fd;
66 
67 	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
68 	if (fd == NULL)
69 		return NULL;
70 	fd->fd_write_failed = false;
71 	return fd;
72 }
73 
ll_file_data_put(struct ll_file_data * fd)74 static void ll_file_data_put(struct ll_file_data *fd)
75 {
76 	if (fd != NULL)
77 		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
78 }
79 
/**
 * Pack the in-core attributes of @inode (fid, mode, a/m/ctime, size, blocks,
 * flags) plus its current IO epoch into @op_data for an MDS request.
 * If @fh is non-NULL it is recorded as the operation's open handle.
 *
 * NOTE(review): op_capa1 receives a capability reference from
 * ll_mdscapa_get(); presumably dropped by ll_finish_md_op_data() — confirm
 * against callers.
 */
void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
			  struct lustre_handle *fh)
{
	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr_blocks = inode->i_blocks;
	/* ia_attr_flags lives in the Lustre-extended iattr; translate the
	 * in-core inode flags to their ext (on-wire) representation. */
	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
					ll_inode_to_ext_flags(inode->i_flags);
	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
	if (fh)
		op_data->op_handle = *fh;
	op_data->op_capa1 = ll_mdscapa_get(inode);

	/* Let the MDS know cached data was modified since open. */
	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
		op_data->op_bias |= MDS_DATA_MODIFIED;
}
100 
101 /**
102  * Closes the IO epoch and packs all the attributes into @op_data for
103  * the CLOSE rpc.
104  */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	/* Mode and all three timestamps are always sent back on close. */
	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
					ATTR_MTIME | ATTR_MTIME_SET |
					ATTR_CTIME | ATTR_CTIME_SET;

	/* Size/blocks and IO-epoch handling only apply to write opens. */
	if (!(och->och_flags & FMODE_WRITE))
		goto out;

	/* Without Size-on-MDS support (or for non-regular files) the client
	 * sends size/blocks itself; otherwise close the IO epoch so the MDS
	 * can drive any needed SOM attribute update. */
	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	else
		ll_ioepoch_close(inode, op_data, &och, 0);

out:
	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);
}
125 
/**
 * Send a CLOSE RPC to the MDS for open handle @och on @inode.
 *
 * A non-NULL @data_version turns the close into an HSM release: the data
 * version and lease handle are packed into the request and the MDS reply is
 * checked for OBD_MD_FLRELEASED (otherwise -EBUSY is returned).
 *
 * On return @och has been freed (with its cookie poisoned), unless the
 * inode still needs a DONE_WRITING epoch close, in which case it is queued
 * via ll_queue_done_writing() and freed later.
 *
 * Returns 0 on success or a negative errno.
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
				     struct inode *inode,
				     struct obd_client_handle *och,
				     const __u64 *data_version)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct obd_device *obd = class_exp2obd(exp);
	int epoch_close = 1;
	int rc;

	if (obd == NULL) {
		/*
		 * XXX: in case of LMV, is this correct to access
		 * ->exp_handle?
		 */
		CERROR("Invalid MDC connection handle %#llx\n",
		       ll_i2mdexp(inode)->exp_handle.h_cookie);
		rc = 0;
		goto out;
	}

	op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
	if (!op_data) {
		/* XXX We leak openhandle and request here. */
		rc = -ENOMEM;
		goto out;
	}

	ll_prepare_close(inode, op_data, och);
	if (data_version != NULL) {
		/* Pass in data_version implies release. */
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *data_version;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	}
	/* Remember whether this close ends the IO epoch; decides the
	 * DONE_WRITING handling in the out: path below. */
	epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc == -EAGAIN) {
		/* This close must have the epoch closed. */
		LASSERT(epoch_close);
		/* MDS has instructed us to obtain the Size-on-MDS attribute
		 * from the OSTs and send a setattr back to the MDS. */
		rc = ll_som_update(inode, op_data);
		if (rc) {
			CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
			       inode->i_ino, rc);
			rc = 0;
		}
	} else if (rc) {
		CERROR("inode %lu mdc close failed: rc = %d\n",
		       inode->i_ino, rc);
	}

	/* DATA_MODIFIED flag was successfully sent on close, cancel data
	 * modification flag. */
	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
		struct ll_inode_info *lli = ll_i2info(inode);

		spin_lock(&lli->lli_lock);
		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
		spin_unlock(&lli->lli_lock);
	}

	if (rc == 0) {
		rc = ll_objects_destroy(req, inode);
		if (rc)
			CERROR("inode %lu ll_objects destroy: rc = %d\n",
			       inode->i_ino, rc);
	}
	/* For HSM release, the MDS confirms the release in the reply body. */
	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
		struct mdt_body *body;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->valid & OBD_MD_FLRELEASED))
			rc = -EBUSY;
	}

	ll_finish_md_op_data(op_data);

out:
	if (exp_connect_som(exp) && !epoch_close &&
	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
		/* Epoch still open: keep @och alive until DONE_WRITING. */
		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
	} else {
		md_clear_open_replay_data(md_exp, och);
		/* Free @och if it is not waiting for DONE_WRITING. */
		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
		OBD_FREE_PTR(och);
	}
	if (req) /* This is close request */
		ptlrpc_req_finished(req);
	return rc;
}
222 
/**
 * Close the MDS open handle of @inode matching @fmode (write/exec/read) if
 * no other local opens of that mode remain.
 *
 * The handle pointer is detached under lli_och_mutex, so a concurrent caller
 * finding the slot already NULL simply does nothing.
 *
 * Returns 0 on success (including "nothing to do"), negative errno from the
 * close RPC otherwise.
 */
int ll_md_real_close(struct inode *inode, fmode_t fmode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **och_p;
	struct obd_client_handle *och;
	__u64 *och_usecount;
	int rc = 0;

	/* Select the per-mode handle slot and its use counter. */
	if (fmode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (fmode & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		LASSERT(fmode & FMODE_READ);
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_usecount > 0) {
		/* There are still users of this handle, so skip
		 * freeing it. */
		mutex_unlock(&lli->lli_och_mutex);
		return 0;
	}

	och = *och_p;
	*och_p = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	if (och != NULL) {
		/* There might be a race and this handle may already
		   be closed. */
		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
					       inode, och, NULL);
	}

	return rc;
}
264 
/**
 * Per-file-descriptor close: drop group lock and lease if held, release the
 * dedicated open handle (fd_och) if the fd owns one, otherwise decrement the
 * shared per-mode open count and close the MDS handle only when no matching
 * OPEN DLM lock remains cached.
 *
 * Always frees the ll_file_data and clears LUSTRE_FPRIVATE(file).
 */
static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
		       struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	int lockmode;
	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
	struct lustre_handle lockh;
	ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN}};
	int rc = 0;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);

	if (fd->fd_lease_och != NULL) {
		bool lease_broken;

		/* Usually the lease is not released when the
		 * application crashed, we need to release here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
			PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;
	}

	/* fd_och is a handle privately owned by this fd (e.g. taken over
	 * for a lease); close it directly, bypassing the shared counters. */
	if (fd->fd_och != NULL) {
		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
		fd->fd_och = NULL;
		goto out;
	}

	/* Let's see if we have good enough OPEN lock on the file and if
	   we can skip talking to MDS */

	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_omode & FMODE_WRITE) {
		lockmode = LCK_CW;
		LASSERT(lli->lli_open_fd_write_count);
		lli->lli_open_fd_write_count--;
	} else if (fd->fd_omode & FMODE_EXEC) {
		lockmode = LCK_PR;
		LASSERT(lli->lli_open_fd_exec_count);
		lli->lli_open_fd_exec_count--;
	} else {
		lockmode = LCK_CR;
		LASSERT(lli->lli_open_fd_read_count);
		lli->lli_open_fd_read_count--;
	}
	mutex_unlock(&lli->lli_och_mutex);

	/* No cached OPEN lock of the right mode: tell the MDS now.  With a
	 * cached lock, the real close is deferred to lock cancellation. */
	if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
			   LDLM_IBITS, &policy, lockmode, &lockh))
		rc = ll_md_real_close(inode, fd->fd_omode);

out:
	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);
	ll_capa_close(inode);

	return rc;
}
328 
329 /* While this returns an error code, fput() the caller does not, so we need
330  * to make every effort to clean up all of our state here.  Also, applications
331  * rarely check close errors and even if an error is returned they will not
332  * re-try the close call.
333  */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);

#ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL state is keyed on the root inode; tear down the
	 * per-pid remote ACL entries when the root fd is released. */
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(fd != NULL);
		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
			fd->fd_flags &= ~LL_FILE_RMTACL;
			rct_del(&sbi->ll_rct, current_pid());
			et_search_free(&sbi->ll_et, current_pid());
		}
	}
#endif

	if (!is_root_inode(inode))
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* The last ref on @file, maybe not the owner pid of statahead.
	 * Different processes can open the same dir, "ll_opendir_key" means:
	 * it is me that should stop the statahead thread. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
	    lli->lli_opendir_pid != 0)
		ll_stop_statahead(inode, lli->lli_opendir_key);

	/* The root inode has no MDS open handle to close (see ll_file_open),
	 * so just drop the fd-private data. */
	if (is_root_inode(inode)) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		return 0;
	}

	/* Pick up any async write error recorded on the cl_object so it can
	 * be reported to this close, then reset for the next opener. */
	if (!S_ISDIR(inode->i_mode)) {
		lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(sbi->ll_md_exp, inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	return rc;
}
387 
/**
 * Re-open @dentry on the MDS via an intent lock (used when no cached open
 * handle/disposition exists, e.g. NFSD or a lost revalidate handle).
 *
 * @lmm/@lmmsize: striping data; when both are zero/NULL a real open lock is
 * requested, otherwise this is only setting stripe info and the lock is
 * skipped.
 *
 * Returns 0 on success, negative errno otherwise.
 *
 * NOTE(review): @req is not initialized before md_intent_lock(); this relies
 * on md_intent_lock() always setting *reqp even on failure — confirm, as
 * the out: path unconditionally calls ptlrpc_req_finished(req).
 */
static int ll_intent_file_open(struct dentry *dentry, void *lmm,
			       int lmmsize, struct lookup_intent *itp)
{
	struct inode *inode = d_inode(dentry);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct dentry *parent = dentry->d_parent;
	const char *name = dentry->d_name.name;
	const int len = dentry->d_name.len;
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	__u32 opc = LUSTRE_OPC_ANY;
	int rc;

	/* Usually we come here only for NFSD, and we want open lock.
	   But we can also get here with pre 2.6.15 patchless kernels, and in
	   that case that lock is also ok */
	/* We can also get here if there was cached open handle in revalidate_it
	 * but it disappeared while we were getting from there to ll_file_open.
	 * But this means this file was closed and immediately opened which
	 * makes a good candidate for using OPEN lock */
	/* If lmmsize & lmm are not 0, we are just setting stripe info
	 * parameters. No need for the open lock */
	if (lmm == NULL && lmmsize == 0) {
		itp->it_flags |= MDS_OPEN_LOCK;
		if (itp->it_flags & FMODE_WRITE)
			opc = LUSTRE_OPC_CREATE;
	}

	op_data  = ll_prep_md_op_data(NULL, d_inode(parent),
				      inode, name, len,
				      O_RDWR, opc, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	itp->it_flags |= MDS_OPEN_BY_FID;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
			    0 /*unused */, &req, ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* reason for keep own exit path - don`t flood log
		* with messages with -ESTALE errors.
		*/
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		     it_open_error(DISP_OPEN_OPEN, itp))
			goto out;
		ll_release_openhandle(inode, itp);
		goto out;
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG)) {
		rc = -ENOENT;
		goto out;
	}

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		goto out;
	}

	rc = ll_prep_inode(&inode, req, NULL, itp);
	if (!rc && itp->d.lustre.it_lock_mode)
		ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);

out:
	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);

	return rc;
}
458 
459 /**
460  * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
461  * not believe attributes if a few ioepoch holders exist. Attributes for
462  * previous ioepoch if new one is opened are also skipped by MDS.
463  */
void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
{
	/* Ignore a zero epoch or one we already hold. */
	if (ioepoch == 0 || lli->lli_ioepoch == ioepoch)
		return;

	lli->lli_ioepoch = ioepoch;
	CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
	       ioepoch, PFID(&lli->lli_fid));
}
472 
ll_och_fill(struct obd_export * md_exp,struct lookup_intent * it,struct obd_client_handle * och)473 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
474 		       struct obd_client_handle *och)
475 {
476 	struct ptlrpc_request *req = it->d.lustre.it_data;
477 	struct mdt_body *body;
478 
479 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
480 	och->och_fh = body->handle;
481 	och->och_fid = body->fid1;
482 	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
483 	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
484 	och->och_flags = it->it_flags;
485 
486 	return md_set_open_replay_data(md_exp, och, it);
487 }
488 
/**
 * Finish the client-local part of an open: optionally fill the open handle
 * @och from the intent reply (and adopt the returned IO epoch), then attach
 * @fd to @file and initialize readahead state.
 *
 * @och may be NULL when an existing shared MDS handle is being reused.
 * Returns 0 on success or the error from ll_och_fill().
 */
static int ll_local_open(struct file *file, struct lookup_intent *it,
			 struct ll_file_data *fd, struct obd_client_handle *och)
{
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);

	LASSERT(!LUSTRE_FPRIVATE(file));

	LASSERT(fd != NULL);

	if (och) {
		struct ptlrpc_request *req = it->d.lustre.it_data;
		struct mdt_body *body;
		int rc;

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
		if (rc != 0)
			return rc;

		/* Adopt the IO epoch the MDS granted with this open. */
		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		ll_ioepoch_open(lli, body->ioepoch);
	}

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	/* Remember the access mode this fd was opened with; used at close. */
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
	return 0;
}
517 
518 /* Open a file, and (for the very first open) create objects on the OSTs at
519  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
520  * creation or open until ll_lov_setstripe() ioctl is called.
521  *
522  * If we already have the stripe MD locally then we don't request it in
523  * md_open(), by passing a lmm_size = 0.
524  *
525  * It is up to the application to ensure no other processes open this file
526  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
527  * used.  We might be able to avoid races of that sort by getting lli_open_sem
528  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
529  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
530  */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0, opendir_set = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
	       inode->i_generation, inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL) {
		rc = -ENOMEM;
		goto out_openerr;
	}

	fd->fd_file = file;
	if (S_ISDIR(inode->i_mode)) {
		/* Claim statahead ownership for this directory if nobody
		 * else has; the owner is the one that must stop it later. */
		spin_lock(&lli->lli_sa_lock);
		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
		    lli->lli_opendir_pid == 0) {
			lli->lli_opendir_key = fd;
			lli->lli_opendir_pid = current_pid();
			opendir_set = 1;
		}
		spin_unlock(&lli->lli_sa_lock);
	}

	/* The root inode never gets an MDS open handle; see also the
	 * matching shortcut in ll_file_release(). */
	if (is_root_inode(inode)) {
		LUSTRE_FPRIVATE(file) = fd;
		return 0;
	}

	if (!it || !it->d.lustre.it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open.  filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	 } else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				goto out_openerr;
			}

			ll_release_openhandle(inode, it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			goto out_openerr;
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->d.lustre.it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			it->it_create_mode |= M_CHECK_STALE;
			rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
			it->it_create_mode &= ~M_CHECK_STALE;
			if (rc)
				goto out_openerr;

			/* The intent now carries a disposition; retry the
			 * handle lookup from the top. */
			goto restart;
		}
		*och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
		if (!*och_p) {
			rc = -ENOMEM;
			goto out_och_free;
		}

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (bug 3430) */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc)
			goto out_och_free;

		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			goto out_och_free;
	}
	mutex_unlock(&lli->lli_och_mutex);
	/* @fd is now owned by the file (LUSTRE_FPRIVATE); don't free it in
	 * the error path below. */
	fd = NULL;

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		goto out_och_free;

	ll_capa_open(inode);

	if (!lli->lli_has_smd &&
	    (cl_is_lov_delay_create(file->f_flags) ||
	     (file->f_mode & FMODE_WRITE) == 0)) {
		CDEBUG(D_INODE, "object creation was delayed\n");
		goto out_och_free;
	}
	cl_lov_delay_create_clear(&file->f_flags);
	goto out_och_free;

out_och_free:
	/* Success path falls through here too; cleanup only runs on error. */
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof(struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (opendir_set != 0)
			ll_stop_statahead(inode, lli->lli_opendir_key);
		if (fd != NULL)
			ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

	/* Drop the request reference pinned by the open intent, if any. */
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	return rc;
}
718 
ll_md_blocking_lease_ast(struct ldlm_lock * lock,struct ldlm_lock_desc * desc,void * data,int flag)719 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
720 			struct ldlm_lock_desc *desc, void *data, int flag)
721 {
722 	int rc;
723 	struct lustre_handle lockh;
724 
725 	switch (flag) {
726 	case LDLM_CB_BLOCKING:
727 		ldlm_lock2handle(lock, &lockh);
728 		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
729 		if (rc < 0) {
730 			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
731 			return rc;
732 		}
733 		break;
734 	case LDLM_CB_CANCELING:
735 		/* do nothing */
736 		break;
737 	}
738 	return 0;
739 }
740 
741 /**
742  * Acquire a lease and open the file.
743  */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	      __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;

	/* A lease is granted for exactly one of read or write. */
	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		return ERR_PTR(-EINVAL);

	if (file != NULL) {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
		struct obd_client_handle **och_p;
		__u64 *och_usecount;

		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			return ERR_PTR(-EPERM);

		/* Get the openhandle of the file */
		rc = -EBUSY;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			mutex_unlock(&lli->lli_och_mutex);
			return ERR_PTR(rc);
		}

		/* Take over the shared open handle into fd_och, but only if
		 * this fd is the sole opener of that mode. */
		if (fd->fd_och == NULL) {
			if (file->f_mode & FMODE_WRITE) {
				LASSERT(lli->lli_mds_write_och != NULL);
				och_p = &lli->lli_mds_write_och;
				och_usecount = &lli->lli_open_fd_write_count;
			} else {
				LASSERT(lli->lli_mds_read_och != NULL);
				och_p = &lli->lli_mds_read_och;
				och_usecount = &lli->lli_open_fd_read_count;
			}
			if (*och_usecount == 1) {
				fd->fd_och = *och_p;
				*och_p = NULL;
				*och_usecount = 0;
				rc = 0;
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (rc < 0) /* more than 1 opener */
			return ERR_PTR(rc);

		LASSERT(fd->fd_och != NULL);
		old_handle = fd->fd_och->och_fh;
	}

	och = kzalloc(sizeof(*och), GFP_NOFS);
	if (!och)
		return ERR_PTR(-ENOMEM);

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
					LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		rc = PTR_ERR(op_data);
		goto out;
	}

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
				ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * broken;
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
				LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
	if (rc < 0)
		goto out_release_it;

	if (it_disposition(&it, DISP_LOOKUP_NEG)) {
		rc = -ENOENT;
		goto out_release_it;
	}

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		goto out_release_it;

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
		rc = -EOPNOTSUPP;
		goto out_close;
	}

	/* already get lease, handle lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.d.lustre.it_lock_mode == 0 ||
	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
			PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
			it.d.lustre.it_lock_bits);
		rc = -EPROTO;
		goto out_close;
	}

	ll_intent_release(&it);
	return och;

out_close:
	/* Lease negotiation failed after the open succeeded: close the
	 * openhandle we just obtained and cancel its open lock. */
	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
	if (rc2)
		CERROR("Close openhandle returned %d\n", rc2);

	/* cancel open lock */
	if (it.d.lustre.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
						it.d.lustre.it_lock_mode);
		it.d.lustre.it_lock_mode = 0;
	}
out_release_it:
	ll_intent_release(&it);
out:
	OBD_FREE_PTR(och);
	return ERR_PTR(rc);
}
881 
882 /**
883  * Release lease and close the file.
884  * It will check if the lease has ever broken.
885  */
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken)
{
	struct ldlm_lock *lock = ldlm_handle2lock(&och->och_lease_handle);
	bool broken = true;

	/* If the lease lock still exists, read its cancel flag to learn
	 * whether the lease was already broken by the server. */
	if (lock) {
		lock_res_and_lock(lock);
		broken = ldlm_is_cancel(lock);
		unlock_res_and_lock(lock);
		ldlm_lock_put(lock);
	}

	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
		PFID(&ll_i2info(inode)->lli_fid), broken);

	/* Lease still intact: cancel the lock ourselves before closing. */
	if (!broken)
		ldlm_cli_cancel(&och->och_lease_handle, 0);

	if (lease_broken)
		*lease_broken = broken;

	return ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode,
					 och, NULL);
}
913 
914 /* Fills the obdo with the attributes for the lsm */
/* Fills the obdo with the attributes for the lsm */
static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
			  struct obd_capa *capa, struct obdo *obdo,
			  __u64 ioepoch, int sync)
{
	struct ptlrpc_request_set *set;
	struct obd_info	    oinfo = { { { 0 } } };
	int			rc;

	LASSERT(lsm != NULL);

	oinfo.oi_md = lsm;
	oinfo.oi_oa = obdo;
	oinfo.oi_oa->o_oi = lsm->lsm_oi;
	oinfo.oi_oa->o_mode = S_IFREG;
	oinfo.oi_oa->o_ioepoch = ioepoch;
	/* Request the full set of size/time/group attributes from the OSTs. */
	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
			       OBD_MD_FLDATAVERSION;
	oinfo.oi_capa = capa;
	/* @sync requests the getattr under a server-side lock (SRVLOCK). */
	if (sync) {
		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
	}

	set = ptlrpc_prep_set();
	if (set == NULL) {
		CERROR("can't allocate ptlrpc set\n");
		rc = -ENOMEM;
	} else {
		rc = obd_getattr_async(exp, &oinfo, set);
		if (rc == 0)
			rc = ptlrpc_set_wait(set);
		ptlrpc_set_destroy(set);
	}
	/* On success, keep only the fields that are actually valid in the
	 * merged reply for the caller to consume. */
	if (rc == 0)
		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
					 OBD_MD_FLDATAVERSION);
	return rc;
}
959 
960 /**
961   * Performs the getattr on the inode and updates its fields.
962   * If @sync != 0, perform the getattr under the server-side lock.
963   */
ll_inode_getattr(struct inode * inode,struct obdo * obdo,__u64 ioepoch,int sync)964 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
965 		     __u64 ioepoch, int sync)
966 {
967 	struct obd_capa      *capa = ll_mdscapa_get(inode);
968 	struct lov_stripe_md *lsm;
969 	int rc;
970 
971 	lsm = ccc_inode_lsm_get(inode);
972 	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
973 			    capa, obdo, ioepoch, sync);
974 	capa_put(capa);
975 	if (rc == 0) {
976 		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
977 
978 		obdo_refresh_inode(inode, obdo, obdo->o_valid);
979 		CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
980 		       POSTID(oi), i_size_read(inode),
981 		       (unsigned long long)inode->i_blocks,
982 		       1UL << inode->i_blkbits);
983 	}
984 	ccc_inode_lsm_put(inode, lsm);
985 	return rc;
986 }
987 
/**
 * Merge the MDS-cached timestamps with the attributes held by the
 * cl_object (which reflect OST state), keeping the newest of each,
 * and update the inode size/blocks from the cl_object attributes.
 *
 * Runs under ll_inode_size_lock() so the update is atomic w.r.t. other
 * size/timestamp writers.
 *
 * \retval 0	success
 * \retval <0	error from cl_object_attr_get()
 */
int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = ccc_env_thread_attr(env);
	struct ost_lvb lvb;
	int rc = 0;

	ll_inode_size_lock(inode);
	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;

	/* snapshot current inode values for comparison below */
	lvb.lvb_size = i_size_read(inode);
	lvb.lvb_blocks = inode->i_blocks;
	lvb.lvb_mtime = LTIME_S(inode->i_mtime);
	lvb.lvb_atime = LTIME_S(inode->i_atime);
	lvb.lvb_ctime = LTIME_S(inode->i_ctime);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc == 0) {
		/* keep the newest of each timestamp */
		if (lvb.lvb_atime < attr->cat_atime)
			lvb.lvb_atime = attr->cat_atime;
		if (lvb.lvb_ctime < attr->cat_ctime)
			lvb.lvb_ctime = attr->cat_ctime;
		if (lvb.lvb_mtime < attr->cat_mtime)
			lvb.lvb_mtime = attr->cat_mtime;

		CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
				PFID(&lli->lli_fid), attr->cat_size);
		cl_isize_write_nolock(inode, attr->cat_size);

		inode->i_blocks = attr->cat_blocks;

		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
		LTIME_S(inode->i_atime) = lvb.lvb_atime;
		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
	}
	ll_inode_size_unlock(inode);

	return rc;
}
1035 
/* Glimpse the object attributes for @lsm and copy them into @st. */
int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
		     lstat_t *st)
{
	struct obdo obdo = { 0 };
	int rc;

	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
	if (rc)
		return rc;

	st->st_size   = obdo.o_size;
	st->st_blocks = obdo.o_blocks;
	st->st_mtime  = obdo.o_mtime;
	st->st_atime  = obdo.o_atime;
	st->st_ctime  = obdo.o_ctime;
	return 0;
}
1052 
file_is_noatime(const struct file * file)1053 static bool file_is_noatime(const struct file *file)
1054 {
1055 	const struct vfsmount *mnt = file->f_path.mnt;
1056 	const struct inode *inode = file_inode(file);
1057 
1058 	/* Adapted from file_accessed() and touch_atime().*/
1059 	if (file->f_flags & O_NOATIME)
1060 		return true;
1061 
1062 	if (inode->i_flags & S_NOATIME)
1063 		return true;
1064 
1065 	if (IS_NOATIME(inode))
1066 		return true;
1067 
1068 	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1069 		return true;
1070 
1071 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1072 		return true;
1073 
1074 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1075 		return true;
1076 
1077 	return false;
1078 }
1079 
ll_io_init(struct cl_io * io,const struct file * file,int write)1080 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1081 {
1082 	struct inode *inode = file_inode(file);
1083 
1084 	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1085 	if (write) {
1086 		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1087 		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1088 				      file->f_flags & O_DIRECT ||
1089 				      IS_SYNC(inode);
1090 	}
1091 	io->ci_obj     = ll_i2info(inode)->lli_clob;
1092 	io->ci_lockreq = CILR_MAYBE;
1093 	if (ll_file_nolock(file)) {
1094 		io->ci_lockreq = CILR_NEVER;
1095 		io->ci_no_srvlock = 1;
1096 	} else if (file->f_flags & O_APPEND) {
1097 		io->ci_lockreq = CILR_MANDATORY;
1098 	}
1099 
1100 	io->ci_noatime = file_is_noatime(file);
1101 }
1102 
1103 static ssize_t
ll_file_io_generic(const struct lu_env * env,struct vvp_io_args * args,struct file * file,enum cl_io_type iot,loff_t * ppos,size_t count)1104 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1105 		   struct file *file, enum cl_io_type iot,
1106 		   loff_t *ppos, size_t count)
1107 {
1108 	struct ll_inode_info *lli = ll_i2info(file_inode(file));
1109 	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1110 	struct cl_io	 *io;
1111 	ssize_t	       result;
1112 
1113 restart:
1114 	io = ccc_env_thread_io(env);
1115 	ll_io_init(io, file, iot == CIT_WRITE);
1116 
1117 	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1118 		struct vvp_io *vio = vvp_env_io(env);
1119 		struct ccc_io *cio = ccc_env_io(env);
1120 		int write_mutex_locked = 0;
1121 
1122 		cio->cui_fd  = LUSTRE_FPRIVATE(file);
1123 		vio->cui_io_subtype = args->via_io_subtype;
1124 
1125 		switch (vio->cui_io_subtype) {
1126 		case IO_NORMAL:
1127 			cio->cui_iter = args->u.normal.via_iter;
1128 			cio->cui_iocb = args->u.normal.via_iocb;
1129 			if ((iot == CIT_WRITE) &&
1130 			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1131 				if (mutex_lock_interruptible(&lli->
1132 							       lli_write_mutex)) {
1133 					result = -ERESTARTSYS;
1134 					goto out;
1135 				}
1136 				write_mutex_locked = 1;
1137 			} else if (iot == CIT_READ) {
1138 				down_read(&lli->lli_trunc_sem);
1139 			}
1140 			break;
1141 		case IO_SPLICE:
1142 			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1143 			vio->u.splice.cui_flags = args->u.splice.via_flags;
1144 			break;
1145 		default:
1146 			CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1147 			LBUG();
1148 		}
1149 		result = cl_io_loop(env, io);
1150 		if (write_mutex_locked)
1151 			mutex_unlock(&lli->lli_write_mutex);
1152 		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1153 			up_read(&lli->lli_trunc_sem);
1154 	} else {
1155 		/* cl_io_rw_init() handled IO */
1156 		result = io->ci_result;
1157 	}
1158 
1159 	if (io->ci_nob > 0) {
1160 		result = io->ci_nob;
1161 		*ppos = io->u.ci_wr.wr.crw_pos;
1162 	}
1163 	goto out;
1164 out:
1165 	cl_io_fini(env, io);
1166 	/* If any bit been read/written (result != 0), we just return
1167 	 * short read/write instead of restart io. */
1168 	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1169 		CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1170 		       iot == CIT_READ ? "read" : "write",
1171 		       file, *ppos, count);
1172 		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1173 		goto restart;
1174 	}
1175 
1176 	if (iot == CIT_READ) {
1177 		if (result >= 0)
1178 			ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1179 					   LPROC_LL_READ_BYTES, result);
1180 	} else if (iot == CIT_WRITE) {
1181 		if (result >= 0) {
1182 			ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1183 					   LPROC_LL_WRITE_BYTES, result);
1184 			fd->fd_write_failed = false;
1185 		} else if (result != -ERESTARTSYS) {
1186 			fd->fd_write_failed = true;
1187 		}
1188 	}
1189 
1190 	return result;
1191 }
1192 
ll_file_read_iter(struct kiocb * iocb,struct iov_iter * to)1193 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1194 {
1195 	struct lu_env      *env;
1196 	struct vvp_io_args *args;
1197 	ssize_t	     result;
1198 	int		 refcheck;
1199 
1200 	env = cl_env_get(&refcheck);
1201 	if (IS_ERR(env))
1202 		return PTR_ERR(env);
1203 
1204 	args = vvp_env_args(env, IO_NORMAL);
1205 	args->u.normal.via_iter = to;
1206 	args->u.normal.via_iocb = iocb;
1207 
1208 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1209 				    &iocb->ki_pos, iov_iter_count(to));
1210 	cl_env_put(env, &refcheck);
1211 	return result;
1212 }
1213 
1214 /*
1215  * Write to a file (through the page cache).
1216  */
ll_file_write_iter(struct kiocb * iocb,struct iov_iter * from)1217 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1218 {
1219 	struct lu_env      *env;
1220 	struct vvp_io_args *args;
1221 	ssize_t	     result;
1222 	int		 refcheck;
1223 
1224 	env = cl_env_get(&refcheck);
1225 	if (IS_ERR(env))
1226 		return PTR_ERR(env);
1227 
1228 	args = vvp_env_args(env, IO_NORMAL);
1229 	args->u.normal.via_iter = from;
1230 	args->u.normal.via_iocb = iocb;
1231 
1232 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1233 				  &iocb->ki_pos, iov_iter_count(from));
1234 	cl_env_put(env, &refcheck);
1235 	return result;
1236 }
1237 
1238 /*
1239  * Send file content (through pagecache) somewhere with helper
1240  */
ll_file_splice_read(struct file * in_file,loff_t * ppos,struct pipe_inode_info * pipe,size_t count,unsigned int flags)1241 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1242 				   struct pipe_inode_info *pipe, size_t count,
1243 				   unsigned int flags)
1244 {
1245 	struct lu_env      *env;
1246 	struct vvp_io_args *args;
1247 	ssize_t	     result;
1248 	int		 refcheck;
1249 
1250 	env = cl_env_get(&refcheck);
1251 	if (IS_ERR(env))
1252 		return PTR_ERR(env);
1253 
1254 	args = vvp_env_args(env, IO_SPLICE);
1255 	args->u.splice.via_pipe = pipe;
1256 	args->u.splice.via_flags = flags;
1257 
1258 	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1259 	cl_env_put(env, &refcheck);
1260 	return result;
1261 }
1262 
/**
 * Recreate the OST object(s) described by @oi on OST index @ost_idx
 * for @inode, reusing the file's existing layout.
 *
 * \retval 0	success
 * \retval -ENOENT	the file has no objects
 * \retval -ENOMEM	allocation failure
 */
static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct obd_trans_info oti = { 0 };
	struct obdo *oa = NULL;
	int lsm_size;
	int rc = 0;
	struct lov_stripe_md *lsm = NULL, *lsm2;

	OBDO_ALLOC(oa);
	if (oa == NULL)
		return -ENOMEM;

	lsm = ccc_inode_lsm_get(inode);
	if (!lsm_has_objects(lsm)) {
		rc = -ENOENT;
		goto out;
	}

	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
		   (lsm->lsm_stripe_count));

	OBD_ALLOC_LARGE(lsm2, lsm_size);
	if (lsm2 == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	/* o_nlink carries the target OST index for the recreate request */
	oa->o_oi = *oi;
	oa->o_nlink = ost_idx;
	oa->o_flags |= OBD_FL_RECREATE_OBJS;
	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
	/* work on a copy of the layout under the size lock */
	memcpy(lsm2, lsm, lsm_size);
	ll_inode_size_lock(inode);
	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
	ll_inode_size_unlock(inode);

	OBD_FREE_LARGE(lsm2, lsm_size);
	/* fall through to common cleanup (the old redundant "goto out"
	 * jumped to the very next label and has been dropped) */
out:
	ccc_inode_lsm_put(inode, lsm);
	OBDO_FREE(oa);
	return rc;
}
1310 
ll_lov_recreate_obj(struct inode * inode,unsigned long arg)1311 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1312 {
1313 	struct ll_recreate_obj ucreat;
1314 	struct ost_id		oi;
1315 
1316 	if (!capable(CFS_CAP_SYS_ADMIN))
1317 		return -EPERM;
1318 
1319 	if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1320 			   sizeof(ucreat)))
1321 		return -EFAULT;
1322 
1323 	ostid_set_seq_mdt0(&oi);
1324 	ostid_set_id(&oi, ucreat.lrc_id);
1325 	return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1326 }
1327 
ll_lov_recreate_fid(struct inode * inode,unsigned long arg)1328 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1329 {
1330 	struct lu_fid	fid;
1331 	struct ost_id	oi;
1332 	u32		ost_idx;
1333 
1334 	if (!capable(CFS_CAP_SYS_ADMIN))
1335 		return -EPERM;
1336 
1337 	if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1338 		return -EFAULT;
1339 
1340 	fid_to_ostid(&fid, &oi);
1341 	ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1342 	return ll_lov_recreate(inode, &oi, ost_idx);
1343 }
1344 
/**
 * Set the striping EA on @inode by doing an intent open with the
 * user-supplied layout @lum.
 *
 * \param inode	   target inode; must not already have a layout
 * \param dentry   dentry used for the intent open
 * \param flags	   open flags for the layout-setting open
 * \param lum	   user-supplied striping descriptor
 * \param lum_size size of @lum in bytes
 *
 * \retval 0	   success
 * \retval -EEXIST a layout already exists on the file
 * \retval <0	   other failure
 */
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
			     int flags, struct lov_user_md *lum, int lum_size)
{
	struct lov_stripe_md *lsm = NULL;
	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
	int rc = 0;

	/* the layout may be set only once per file */
	lsm = ccc_inode_lsm_get(inode);
	if (lsm != NULL) {
		ccc_inode_lsm_put(inode, lsm);
		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
		       inode->i_ino);
		rc = -EEXIST;
		goto out;
	}

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
	if (rc)
		goto out_unlock;
	rc = oit.d.lustre.it_status;
	if (rc < 0)
		goto out_req_free;

	ll_release_openhandle(inode, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);
	/* lsm is NULL on this path; the put pairs with the get above */
	ccc_inode_lsm_put(inode, lsm);
out:
	return rc;
out_req_free:
	/* NOTE(review): this path frees the request and returns without
	 * calling ll_inode_size_unlock() or ll_intent_release() — looks
	 * like a lock leak; confirm against upstream before relying on it */
	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
	goto out;
}
1381 
/**
 * Fetch the LOV EA of @filename (a child of @inode) from the MDS.
 *
 * \param inode	   parent directory
 * \param filename name of the child whose striping is requested
 * \param lmmp	   [out] pointer into the reply buffer holding the LOV EA
 * \param lmm_size [out] size of the EA in bytes
 * \param request  [out] the MDS reply; caller must ptlrpc_req_finished()
 *		   it (the EA points into this buffer)
 *
 * \retval 0	   success
 * \retval -ENODATA the file has no striping EA
 * \retval -EPROTO  unrecognized LOV magic
 * \retval <0	   other failure
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body  *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc)
		return rc;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
		       filename, rc);
		goto out;
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->eadatasize;

	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
			lmmsize == 0) {
		rc = -ENODATA;
		goto out;
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
		rc = -EPROTO;
		goto out;
	}

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian.  We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		int stripe_count;

		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
		/* released files carry no object array to swab */
		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
			stripe_count = 0;

		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				 stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				 stripe_count);
		}
	}

out:
	/* outputs are set even on error; req may be NULL on early failure */
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}
1467 
ll_lov_setea(struct inode * inode,struct file * file,unsigned long arg)1468 static int ll_lov_setea(struct inode *inode, struct file *file,
1469 			    unsigned long arg)
1470 {
1471 	int			 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1472 	struct lov_user_md	*lump;
1473 	int			 lum_size = sizeof(struct lov_user_md) +
1474 					    sizeof(struct lov_user_ost_data);
1475 	int			 rc;
1476 
1477 	if (!capable(CFS_CAP_SYS_ADMIN))
1478 		return -EPERM;
1479 
1480 	OBD_ALLOC_LARGE(lump, lum_size);
1481 	if (lump == NULL)
1482 		return -ENOMEM;
1483 
1484 	if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1485 		OBD_FREE_LARGE(lump, lum_size);
1486 		return -EFAULT;
1487 	}
1488 
1489 	rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1490 				     lum_size);
1491 	cl_lov_delay_create_clear(&file->f_flags);
1492 
1493 	OBD_FREE_LARGE(lump, lum_size);
1494 	return rc;
1495 }
1496 
/* ioctl helper (LL_IOC_LOV_SETSTRIPE): set the file's striping from a
 * userspace lov_user_md (v1 or v3), then echo the resulting layout back
 * to the caller's buffer. */
static int ll_lov_setstripe(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct lov_user_md_v3	 lumv3;
	struct lov_user_md_v1	*lumv1 = (struct lov_user_md_v1 *)&lumv3;
	struct lov_user_md_v1	*lumv1p = (struct lov_user_md_v1 *)arg;
	struct lov_user_md_v3	*lumv3p = (struct lov_user_md_v3 *)arg;
	int			 lum_size, rc;
	int			 flags = FMODE_WRITE;

	/* first try with v1 which is smaller than v3 */
	lum_size = sizeof(struct lov_user_md_v1);
	if (copy_from_user(lumv1, lumv1p, lum_size))
		return -EFAULT;

	/* v3 detected: re-read the full v3 descriptor */
	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
		lum_size = sizeof(struct lov_user_md_v3);
		if (copy_from_user(&lumv3, lumv3p, lum_size))
			return -EFAULT;
	}

	rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
				      lum_size);
	cl_lov_delay_create_clear(&file->f_flags);
	if (rc == 0) {
		struct lov_stripe_md *lsm;
		__u32 gen;

		/* NOTE(review): put_user() return value is ignored here;
		 * a fault in the user buffer goes unreported */
		put_user(0, &lumv1p->lmm_stripe_count);

		/* pick up the new layout, then copy it back to userspace */
		ll_layout_refresh(inode, &gen);
		lsm = ccc_inode_lsm_get(inode);
		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
				   0, lsm, (void *)arg);
		ccc_inode_lsm_put(inode, lsm);
	}
	return rc;
}
1535 
ll_lov_getstripe(struct inode * inode,unsigned long arg)1536 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1537 {
1538 	struct lov_stripe_md *lsm;
1539 	int rc = -ENODATA;
1540 
1541 	lsm = ccc_inode_lsm_get(inode);
1542 	if (lsm != NULL)
1543 		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1544 				   lsm, (void *)arg);
1545 	ccc_inode_lsm_put(inode, lsm);
1546 	return rc;
1547 }
1548 
/* Take a group lock with group id @arg on behalf of @file.
 * Fails with -EINVAL if @arg is 0, a group lock is already held on this
 * file descriptor, or another thread raced us to it; -EOPNOTSUPP when
 * the file was opened with locking disabled. */
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info   *lli = ll_i2info(inode);
	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock    grouplock;
	int		     rc;

	if (arg == 0) {
		CWARN("group id for group lock must not be 0\n");
		return -EINVAL;
	}

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	/* first check under the lock whether we already hold one */
	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/* acquire the lock outside the spinlock (may block) */
	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		return rc;

	/* re-check: another thread may have installed a lock meanwhile */
	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		return -EINVAL;
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	return 0;
}
1595 
/* Release the group lock with group id @arg held on @file.
 * -EINVAL if no group lock is held or @arg does not match its gid. */
static int ll_put_grouplock(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct ll_inode_info   *lli = ll_i2info(inode);
	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock    grouplock;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock != NULL);

	if (fd->fd_grouplock.cg_gid != arg) {
		/* note: warning emitted while still holding lli_lock */
		CWARN("group lock %lu doesn't match current id %lu\n",
		       arg, fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}

	/* detach the grouplock state under the lock, drop it outside */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	return 0;
}
1627 
1628 /**
1629  * Close inode open handle
1630  *
1631  * \param inode  [in]     inode in question
1632  * \param it     [in,out] intent which contains open info and result
1633  *
1634  * \retval 0     success
1635  * \retval <0    failure
1636  */
ll_release_openhandle(struct inode * inode,struct lookup_intent * it)1637 int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1638 {
1639 	struct obd_client_handle *och;
1640 	int rc;
1641 
1642 	LASSERT(inode);
1643 
1644 	/* Root ? Do nothing. */
1645 	if (is_root_inode(inode))
1646 		return 0;
1647 
1648 	/* No open handle to close? Move away */
1649 	if (!it_disposition(it, DISP_OPEN_OPEN))
1650 		return 0;
1651 
1652 	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1653 
1654 	och = kzalloc(sizeof(*och), GFP_NOFS);
1655 	if (!och) {
1656 		rc = -ENOMEM;
1657 		goto out;
1658 	}
1659 
1660 	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1661 
1662 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1663 				       inode, och, NULL);
1664 out:
1665 	/* this one is in place of ll_file_open */
1666 	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1667 		ptlrpc_req_finished(it->d.lustre.it_data);
1668 		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1669 	}
1670 	return rc;
1671 }
1672 
/**
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
 *
 * \param inode	     inode being mapped
 * \param fiemap     in/out fiemap buffer; extents are written in place
 * \param num_bytes  total size of @fiemap including its extent array
 *
 * \retval 0	     success
 * \retval -EBADR    unsupported fiemap flags (supported set is written back)
 * \retval -ENOENT   no layout on the file
 * \retval <0	     other failure
 */
static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
			size_t num_bytes)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct lov_stripe_md *lsm = NULL;
	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
	__u32 vallen = num_bytes;
	int rc;

	/* Checks for fiemap flags */
	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		/* tell the caller which flags we do support */
		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
		return -EBADR;
	}

	/* Check for FIEMAP_FLAG_SYNC */
	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
		rc = filemap_fdatawrite(inode->i_mapping);
		if (rc)
			return rc;
	}

	lsm = ccc_inode_lsm_get(inode);
	if (lsm == NULL)
		return -ENOENT;

	/* If the stripe_count > 1 and the application does not understand
	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
	 */
	if (lsm->lsm_stripe_count > 1 &&
	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
		rc = -EOPNOTSUPP;
		goto out;
	}

	fm_key.oa.o_oi = lsm->lsm_oi;
	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;

	/* a zero cached size may just mean we never glimpsed it */
	if (i_size_read(inode) == 0) {
		rc = ll_glimpse_size(inode);
		if (rc)
			goto out;
	}

	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
	/* If filesize is 0, then there would be no objects for mapping */
	if (fm_key.oa.o_size == 0) {
		fiemap->fm_mapped_extents = 0;
		rc = 0;
		goto out;
	}

	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));

	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
			  fiemap, lsm);
	if (rc)
		CERROR("obd_get_info failed: rc = %d\n", rc);

out:
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
1741 
/**
 * OBD_IOC_FID2PATH: resolve a FID to a path, via the MDC.
 *
 * \param inode	inode used to reach the MD export
 * \param arg	userspace struct getinfo_fid2path; result copied back
 *
 * \retval 0	success
 * \retval -EPERM	caller lacks permission
 * \retval -EINVAL	requested path length exceeds PATH_MAX
 * \retval <0	other failure
 */
int ll_fid2path(struct inode *inode, void __user *arg)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	const struct getinfo_fid2path __user *gfin = arg;
	struct getinfo_fid2path *gfout;
	u32 pathlen;
	size_t outsize;
	int rc;

	if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
		return -EPERM;

	/* Only need to get the buflen */
	if (get_user(pathlen, &gfin->gf_pathlen))
		return -EFAULT;

	/* bound the allocation before trusting the user-supplied length */
	if (pathlen > PATH_MAX)
		return -EINVAL;

	outsize = sizeof(*gfout) + pathlen;

	gfout = kzalloc(outsize, GFP_NOFS);
	if (!gfout)
		return -ENOMEM;

	if (copy_from_user(gfout, arg, sizeof(*gfout))) {
		rc = -EFAULT;
		goto gf_free;
	}

	/* Call mdc_iocontrol */
	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
	if (rc != 0)
		goto gf_free;

	if (copy_to_user(arg, gfout, outsize))
		rc = -EFAULT;

gf_free:
	/* gfout was kzalloc()ed, so free with kfree(), not OBD_FREE() */
	kfree(gfout);
	return rc;
}
1785 
ll_ioctl_fiemap(struct inode * inode,unsigned long arg)1786 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1787 {
1788 	struct ll_user_fiemap *fiemap_s;
1789 	size_t num_bytes, ret_bytes;
1790 	unsigned int extent_count;
1791 	int rc = 0;
1792 
1793 	/* Get the extent count so we can calculate the size of
1794 	 * required fiemap buffer */
1795 	if (get_user(extent_count,
1796 	    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1797 		return -EFAULT;
1798 
1799 	if (extent_count >=
1800 	    (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1801 		return -EINVAL;
1802 	num_bytes = sizeof(*fiemap_s) + (extent_count *
1803 					 sizeof(struct ll_fiemap_extent));
1804 
1805 	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1806 	if (fiemap_s == NULL)
1807 		return -ENOMEM;
1808 
1809 	/* get the fiemap value */
1810 	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1811 			   sizeof(*fiemap_s))) {
1812 		rc = -EFAULT;
1813 		goto error;
1814 	}
1815 
1816 	/* If fm_extent_count is non-zero, read the first extent since
1817 	 * it is used to calculate end_offset and device from previous
1818 	 * fiemap call. */
1819 	if (extent_count) {
1820 		if (copy_from_user(&fiemap_s->fm_extents[0],
1821 		    (char __user *)arg + sizeof(*fiemap_s),
1822 		    sizeof(struct ll_fiemap_extent))) {
1823 			rc = -EFAULT;
1824 			goto error;
1825 		}
1826 	}
1827 
1828 	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1829 	if (rc)
1830 		goto error;
1831 
1832 	ret_bytes = sizeof(struct ll_user_fiemap);
1833 
1834 	if (extent_count != 0)
1835 		ret_bytes += (fiemap_s->fm_mapped_extents *
1836 				 sizeof(struct ll_fiemap_extent));
1837 
1838 	if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1839 		rc = -EFAULT;
1840 
1841 error:
1842 	OBD_FREE_LARGE(fiemap_s, num_bytes);
1843 	return rc;
1844 }
1845 
1846 /*
1847  * Read the data_version for inode.
1848  *
1849  * This value is computed using stripe object version on OST.
1850  * Version is computed using server side locking.
1851  *
1852  * @param extent_lock  Take extent lock. Not needed if a process is already
1853  *		       holding the OST object group locks.
1854  */
ll_data_version(struct inode * inode,__u64 * data_version,int extent_lock)1855 int ll_data_version(struct inode *inode, __u64 *data_version,
1856 		    int extent_lock)
1857 {
1858 	struct lov_stripe_md	*lsm = NULL;
1859 	struct ll_sb_info	*sbi = ll_i2sbi(inode);
1860 	struct obdo		*obdo = NULL;
1861 	int			 rc;
1862 
1863 	/* If no stripe, we consider version is 0. */
1864 	lsm = ccc_inode_lsm_get(inode);
1865 	if (!lsm_has_objects(lsm)) {
1866 		*data_version = 0;
1867 		CDEBUG(D_INODE, "No object for inode\n");
1868 		rc = 0;
1869 		goto out;
1870 	}
1871 
1872 	obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1873 	if (!obdo) {
1874 		rc = -ENOMEM;
1875 		goto out;
1876 	}
1877 
1878 	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1879 	if (rc == 0) {
1880 		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1881 			rc = -EOPNOTSUPP;
1882 		else
1883 			*data_version = obdo->o_data_version;
1884 	}
1885 
1886 	OBD_FREE_PTR(obdo);
1887 out:
1888 	ccc_inode_lsm_put(inode, lsm);
1889 	return rc;
1890 }
1891 
/*
 * Trigger a HSM release request for the provided inode.
 *
 * Takes a write lease so no other client can modify the file while it is
 * released, records the latest data_version and merged timestamps, then
 * closes the open handle with MDS_OPEN_RELEASE semantics.
 *
 * \retval 0	success
 * \retval <0	failure (lease open, data_version, env or close error)
 */
int ll_hsm_release(struct inode *inode)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct obd_client_handle *och = NULL;
	__u64 data_version = 0;
	int rc;


	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
	       ll_get_fsname(inode->i_sb, NULL, 0),
	       PFID(&ll_i2info(inode)->lli_fid));

	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
	if (IS_ERR(och)) {
		rc = PTR_ERR(och);
		goto out;
	}

	/* Grab latest data_version and [am]time values */
	rc = ll_data_version(inode, &data_version, 1);
	if (rc != 0)
		goto out;

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env)) {
		rc = PTR_ERR(env);
		goto out;
	}

	/* fold OST timestamps into the inode before the close records them */
	ll_merge_lvb(env, inode);
	cl_env_nested_put(&nest, env);

	/* Release the file.
	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
	 * we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
				       &data_version);
	/* och is consumed by the close above; NULL it so the error path
	 * below does not close it a second time */
	och = NULL;


out:
	if (och != NULL && !IS_ERR(och)) /* close the file */
		ll_lease_close(och, inode, NULL);

	return rc;
}
1942 
/* Scratch state for ll_swap_layouts(): both inodes, their expected data
 * versions and saved timestamps, grouped so the two sides can be swapped
 * as a unit when the operation is sequentialized by fid order. */
struct ll_swap_stack {
	struct iattr		 ia1, ia2;	/* saved [am]times to restore */
	__u64			 dv1, dv2;	/* expected data versions */
	struct inode		*inode1, *inode2;
	bool			 check_dv1, check_dv2;	/* verify dv before swap */
};
1949 
ll_swap_layouts(struct file * file1,struct file * file2,struct lustre_swap_layouts * lsl)1950 static int ll_swap_layouts(struct file *file1, struct file *file2,
1951 			   struct lustre_swap_layouts *lsl)
1952 {
1953 	struct mdc_swap_layouts	 msl;
1954 	struct md_op_data	*op_data;
1955 	__u32			 gid;
1956 	__u64			 dv;
1957 	struct ll_swap_stack	*llss = NULL;
1958 	int			 rc;
1959 
1960 	llss = kzalloc(sizeof(*llss), GFP_NOFS);
1961 	if (!llss)
1962 		return -ENOMEM;
1963 
1964 	llss->inode1 = file_inode(file1);
1965 	llss->inode2 = file_inode(file2);
1966 
1967 	if (!S_ISREG(llss->inode2->i_mode)) {
1968 		rc = -EINVAL;
1969 		goto free;
1970 	}
1971 
1972 	if (inode_permission(llss->inode1, MAY_WRITE) ||
1973 	    inode_permission(llss->inode2, MAY_WRITE)) {
1974 		rc = -EPERM;
1975 		goto free;
1976 	}
1977 
1978 	if (llss->inode2->i_sb != llss->inode1->i_sb) {
1979 		rc = -EXDEV;
1980 		goto free;
1981 	}
1982 
1983 	/* we use 2 bool because it is easier to swap than 2 bits */
1984 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1985 		llss->check_dv1 = true;
1986 
1987 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1988 		llss->check_dv2 = true;
1989 
1990 	/* we cannot use lsl->sl_dvX directly because we may swap them */
1991 	llss->dv1 = lsl->sl_dv1;
1992 	llss->dv2 = lsl->sl_dv2;
1993 
1994 	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1995 	if (rc == 0) /* same file, done! */ {
1996 		rc = 0;
1997 		goto free;
1998 	}
1999 
2000 	if (rc < 0) { /* sequentialize it */
2001 		swap(llss->inode1, llss->inode2);
2002 		swap(file1, file2);
2003 		swap(llss->dv1, llss->dv2);
2004 		swap(llss->check_dv1, llss->check_dv2);
2005 	}
2006 
2007 	gid = lsl->sl_gid;
2008 	if (gid != 0) { /* application asks to flush dirty cache */
2009 		rc = ll_get_grouplock(llss->inode1, file1, gid);
2010 		if (rc < 0)
2011 			goto free;
2012 
2013 		rc = ll_get_grouplock(llss->inode2, file2, gid);
2014 		if (rc < 0) {
2015 			ll_put_grouplock(llss->inode1, file1, gid);
2016 			goto free;
2017 		}
2018 	}
2019 
2020 	/* to be able to restore mtime and atime after swap
2021 	 * we need to first save them */
2022 	if (lsl->sl_flags &
2023 	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2024 		llss->ia1.ia_mtime = llss->inode1->i_mtime;
2025 		llss->ia1.ia_atime = llss->inode1->i_atime;
2026 		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2027 		llss->ia2.ia_mtime = llss->inode2->i_mtime;
2028 		llss->ia2.ia_atime = llss->inode2->i_atime;
2029 		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2030 	}
2031 
2032 	/* ultimate check, before swapping the layouts we check if
2033 	 * dataversion has changed (if requested) */
2034 	if (llss->check_dv1) {
2035 		rc = ll_data_version(llss->inode1, &dv, 0);
2036 		if (rc)
2037 			goto putgl;
2038 		if (dv != llss->dv1) {
2039 			rc = -EAGAIN;
2040 			goto putgl;
2041 		}
2042 	}
2043 
2044 	if (llss->check_dv2) {
2045 		rc = ll_data_version(llss->inode2, &dv, 0);
2046 		if (rc)
2047 			goto putgl;
2048 		if (dv != llss->dv2) {
2049 			rc = -EAGAIN;
2050 			goto putgl;
2051 		}
2052 	}
2053 
2054 	/* struct md_op_data is used to send the swap args to the mdt
2055 	 * only flags is missing, so we use struct mdc_swap_layouts
2056 	 * through the md_op_data->op_data */
2057 	/* flags from user space have to be converted before they are send to
2058 	 * server, no flag is sent today, they are only used on the client */
2059 	msl.msl_flags = 0;
2060 	rc = -ENOMEM;
2061 	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2062 				     0, LUSTRE_OPC_ANY, &msl);
2063 	if (IS_ERR(op_data)) {
2064 		rc = PTR_ERR(op_data);
2065 		goto free;
2066 	}
2067 
2068 	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2069 			   sizeof(*op_data), op_data, NULL);
2070 	ll_finish_md_op_data(op_data);
2071 
2072 putgl:
2073 	if (gid != 0) {
2074 		ll_put_grouplock(llss->inode2, file2, gid);
2075 		ll_put_grouplock(llss->inode1, file1, gid);
2076 	}
2077 
2078 	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2079 	if (rc != 0)
2080 		goto free;
2081 
2082 	/* clear useless flags */
2083 	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2084 		llss->ia1.ia_valid &= ~ATTR_MTIME;
2085 		llss->ia2.ia_valid &= ~ATTR_MTIME;
2086 	}
2087 
2088 	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2089 		llss->ia1.ia_valid &= ~ATTR_ATIME;
2090 		llss->ia2.ia_valid &= ~ATTR_ATIME;
2091 	}
2092 
2093 	/* update time if requested */
2094 	rc = 0;
2095 	if (llss->ia2.ia_valid != 0) {
2096 		mutex_lock(&llss->inode1->i_mutex);
2097 		rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2098 		mutex_unlock(&llss->inode1->i_mutex);
2099 	}
2100 
2101 	if (llss->ia1.ia_valid != 0) {
2102 		int rc1;
2103 
2104 		mutex_lock(&llss->inode2->i_mutex);
2105 		rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2106 		mutex_unlock(&llss->inode2->i_mutex);
2107 		if (rc == 0)
2108 			rc = rc1;
2109 	}
2110 
2111 free:
2112 	if (llss != NULL)
2113 		OBD_FREE_PTR(llss);
2114 
2115 	return rc;
2116 }
2117 
ll_hsm_state_set(struct inode * inode,struct hsm_state_set * hss)2118 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2119 {
2120 	struct md_op_data	*op_data;
2121 	int			 rc;
2122 
2123 	/* Non-root users are forbidden to set or clear flags which are
2124 	 * NOT defined in HSM_USER_MASK. */
2125 	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2126 	    !capable(CFS_CAP_SYS_ADMIN))
2127 		return -EPERM;
2128 
2129 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2130 				     LUSTRE_OPC_ANY, hss);
2131 	if (IS_ERR(op_data))
2132 		return PTR_ERR(op_data);
2133 
2134 	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2135 			   sizeof(*op_data), op_data, NULL);
2136 
2137 	ll_finish_md_op_data(op_data);
2138 
2139 	return rc;
2140 }
2141 
ll_hsm_import(struct inode * inode,struct file * file,struct hsm_user_import * hui)2142 static int ll_hsm_import(struct inode *inode, struct file *file,
2143 			 struct hsm_user_import *hui)
2144 {
2145 	struct hsm_state_set	*hss = NULL;
2146 	struct iattr		*attr = NULL;
2147 	int			 rc;
2148 
2149 
2150 	if (!S_ISREG(inode->i_mode))
2151 		return -EINVAL;
2152 
2153 	/* set HSM flags */
2154 	hss = kzalloc(sizeof(*hss), GFP_NOFS);
2155 	if (!hss) {
2156 		rc = -ENOMEM;
2157 		goto out;
2158 	}
2159 
2160 	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2161 	hss->hss_archive_id = hui->hui_archive_id;
2162 	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2163 	rc = ll_hsm_state_set(inode, hss);
2164 	if (rc != 0)
2165 		goto out;
2166 
2167 	attr = kzalloc(sizeof(*attr), GFP_NOFS);
2168 	if (!attr) {
2169 		rc = -ENOMEM;
2170 		goto out;
2171 	}
2172 
2173 	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2174 	attr->ia_mode |= S_IFREG;
2175 	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2176 	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2177 	attr->ia_size = hui->hui_size;
2178 	attr->ia_mtime.tv_sec = hui->hui_mtime;
2179 	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2180 	attr->ia_atime.tv_sec = hui->hui_atime;
2181 	attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2182 
2183 	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2184 			 ATTR_UID | ATTR_GID |
2185 			 ATTR_MTIME | ATTR_MTIME_SET |
2186 			 ATTR_ATIME | ATTR_ATIME_SET;
2187 
2188 	mutex_lock(&inode->i_mutex);
2189 
2190 	rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2191 	if (rc == -ENODATA)
2192 		rc = 0;
2193 
2194 	mutex_unlock(&inode->i_mutex);
2195 
2196 out:
2197 	if (hss != NULL)
2198 		OBD_FREE_PTR(hss);
2199 
2200 	if (attr != NULL)
2201 		OBD_FREE_PTR(attr);
2202 
2203 	return rc;
2204 }
2205 
2206 static long
ll_file_ioctl(struct file * file,unsigned int cmd,unsigned long arg)2207 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2208 {
2209 	struct inode		*inode = file_inode(file);
2210 	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
2211 	int			 flags, rc;
2212 
2213 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2214 	       inode->i_generation, inode, cmd);
2215 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2216 
2217 	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2218 	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2219 		return -ENOTTY;
2220 
2221 	switch (cmd) {
2222 	case LL_IOC_GETFLAGS:
2223 		/* Get the current value of the file flags */
2224 		return put_user(fd->fd_flags, (int *)arg);
2225 	case LL_IOC_SETFLAGS:
2226 	case LL_IOC_CLRFLAGS:
2227 		/* Set or clear specific file flags */
2228 		/* XXX This probably needs checks to ensure the flags are
2229 		 *     not abused, and to handle any flag side effects.
2230 		 */
2231 		if (get_user(flags, (int *) arg))
2232 			return -EFAULT;
2233 
2234 		if (cmd == LL_IOC_SETFLAGS) {
2235 			if ((flags & LL_FILE_IGNORE_LOCK) &&
2236 			    !(file->f_flags & O_DIRECT)) {
2237 				CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2238 				       current->comm);
2239 				return -EINVAL;
2240 			}
2241 
2242 			fd->fd_flags |= flags;
2243 		} else {
2244 			fd->fd_flags &= ~flags;
2245 		}
2246 		return 0;
2247 	case LL_IOC_LOV_SETSTRIPE:
2248 		return ll_lov_setstripe(inode, file, arg);
2249 	case LL_IOC_LOV_SETEA:
2250 		return ll_lov_setea(inode, file, arg);
2251 	case LL_IOC_LOV_SWAP_LAYOUTS: {
2252 		struct file *file2;
2253 		struct lustre_swap_layouts lsl;
2254 
2255 		if (copy_from_user(&lsl, (char *)arg,
2256 				       sizeof(struct lustre_swap_layouts)))
2257 			return -EFAULT;
2258 
2259 		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2260 			return -EPERM;
2261 
2262 		file2 = fget(lsl.sl_fd);
2263 		if (file2 == NULL)
2264 			return -EBADF;
2265 
2266 		rc = -EPERM;
2267 		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2268 			rc = ll_swap_layouts(file, file2, &lsl);
2269 		fput(file2);
2270 		return rc;
2271 	}
2272 	case LL_IOC_LOV_GETSTRIPE:
2273 		return ll_lov_getstripe(inode, arg);
2274 	case LL_IOC_RECREATE_OBJ:
2275 		return ll_lov_recreate_obj(inode, arg);
2276 	case LL_IOC_RECREATE_FID:
2277 		return ll_lov_recreate_fid(inode, arg);
2278 	case FSFILT_IOC_FIEMAP:
2279 		return ll_ioctl_fiemap(inode, arg);
2280 	case FSFILT_IOC_GETFLAGS:
2281 	case FSFILT_IOC_SETFLAGS:
2282 		return ll_iocontrol(inode, file, cmd, arg);
2283 	case FSFILT_IOC_GETVERSION_OLD:
2284 	case FSFILT_IOC_GETVERSION:
2285 		return put_user(inode->i_generation, (int *)arg);
2286 	case LL_IOC_GROUP_LOCK:
2287 		return ll_get_grouplock(inode, file, arg);
2288 	case LL_IOC_GROUP_UNLOCK:
2289 		return ll_put_grouplock(inode, file, arg);
2290 	case IOC_OBD_STATFS:
2291 		return ll_obd_statfs(inode, (void *)arg);
2292 
2293 	/* We need to special case any other ioctls we want to handle,
2294 	 * to send them to the MDS/OST as appropriate and to properly
2295 	 * network encode the arg field.
2296 	case FSFILT_IOC_SETVERSION_OLD:
2297 	case FSFILT_IOC_SETVERSION:
2298 	*/
2299 	case LL_IOC_FLUSHCTX:
2300 		return ll_flush_ctx(inode);
2301 	case LL_IOC_PATH2FID: {
2302 		if (copy_to_user((void *)arg, ll_inode2fid(inode),
2303 				 sizeof(struct lu_fid)))
2304 			return -EFAULT;
2305 
2306 		return 0;
2307 	}
2308 	case OBD_IOC_FID2PATH:
2309 		return ll_fid2path(inode, (void *)arg);
2310 	case LL_IOC_DATA_VERSION: {
2311 		struct ioc_data_version	idv;
2312 		int			rc;
2313 
2314 		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2315 			return -EFAULT;
2316 
2317 		rc = ll_data_version(inode, &idv.idv_version,
2318 				!(idv.idv_flags & LL_DV_NOFLUSH));
2319 
2320 		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2321 			return -EFAULT;
2322 
2323 		return rc;
2324 	}
2325 
2326 	case LL_IOC_GET_MDTIDX: {
2327 		int mdtidx;
2328 
2329 		mdtidx = ll_get_mdt_idx(inode);
2330 		if (mdtidx < 0)
2331 			return mdtidx;
2332 
2333 		if (put_user((int)mdtidx, (int *)arg))
2334 			return -EFAULT;
2335 
2336 		return 0;
2337 	}
2338 	case OBD_IOC_GETDTNAME:
2339 	case OBD_IOC_GETMDNAME:
2340 		return ll_get_obd_name(inode, cmd, arg);
2341 	case LL_IOC_HSM_STATE_GET: {
2342 		struct md_op_data	*op_data;
2343 		struct hsm_user_state	*hus;
2344 		int			 rc;
2345 
2346 		hus = kzalloc(sizeof(*hus), GFP_NOFS);
2347 		if (!hus)
2348 			return -ENOMEM;
2349 
2350 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2351 					     LUSTRE_OPC_ANY, hus);
2352 		if (IS_ERR(op_data)) {
2353 			OBD_FREE_PTR(hus);
2354 			return PTR_ERR(op_data);
2355 		}
2356 
2357 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2358 				   op_data, NULL);
2359 
2360 		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2361 			rc = -EFAULT;
2362 
2363 		ll_finish_md_op_data(op_data);
2364 		OBD_FREE_PTR(hus);
2365 		return rc;
2366 	}
2367 	case LL_IOC_HSM_STATE_SET: {
2368 		struct hsm_state_set	*hss;
2369 		int			 rc;
2370 
2371 		hss = kzalloc(sizeof(*hss), GFP_NOFS);
2372 		if (!hss)
2373 			return -ENOMEM;
2374 
2375 		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2376 			OBD_FREE_PTR(hss);
2377 			return -EFAULT;
2378 		}
2379 
2380 		rc = ll_hsm_state_set(inode, hss);
2381 
2382 		OBD_FREE_PTR(hss);
2383 		return rc;
2384 	}
2385 	case LL_IOC_HSM_ACTION: {
2386 		struct md_op_data		*op_data;
2387 		struct hsm_current_action	*hca;
2388 		int				 rc;
2389 
2390 		hca = kzalloc(sizeof(*hca), GFP_NOFS);
2391 		if (!hca)
2392 			return -ENOMEM;
2393 
2394 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2395 					     LUSTRE_OPC_ANY, hca);
2396 		if (IS_ERR(op_data)) {
2397 			OBD_FREE_PTR(hca);
2398 			return PTR_ERR(op_data);
2399 		}
2400 
2401 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2402 				   op_data, NULL);
2403 
2404 		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2405 			rc = -EFAULT;
2406 
2407 		ll_finish_md_op_data(op_data);
2408 		OBD_FREE_PTR(hca);
2409 		return rc;
2410 	}
2411 	case LL_IOC_SET_LEASE: {
2412 		struct ll_inode_info *lli = ll_i2info(inode);
2413 		struct obd_client_handle *och = NULL;
2414 		bool lease_broken;
2415 		fmode_t mode = 0;
2416 
2417 		switch (arg) {
2418 		case F_WRLCK:
2419 			if (!(file->f_mode & FMODE_WRITE))
2420 				return -EPERM;
2421 			mode = FMODE_WRITE;
2422 			break;
2423 		case F_RDLCK:
2424 			if (!(file->f_mode & FMODE_READ))
2425 				return -EPERM;
2426 			mode = FMODE_READ;
2427 			break;
2428 		case F_UNLCK:
2429 			mutex_lock(&lli->lli_och_mutex);
2430 			if (fd->fd_lease_och != NULL) {
2431 				och = fd->fd_lease_och;
2432 				fd->fd_lease_och = NULL;
2433 			}
2434 			mutex_unlock(&lli->lli_och_mutex);
2435 
2436 			if (och != NULL) {
2437 				mode = och->och_flags &
2438 				       (FMODE_READ|FMODE_WRITE);
2439 				rc = ll_lease_close(och, inode, &lease_broken);
2440 				if (rc == 0 && lease_broken)
2441 					mode = 0;
2442 			} else {
2443 				rc = -ENOLCK;
2444 			}
2445 
2446 			/* return the type of lease or error */
2447 			return rc < 0 ? rc : (int)mode;
2448 		default:
2449 			return -EINVAL;
2450 		}
2451 
2452 		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2453 
2454 		/* apply for lease */
2455 		och = ll_lease_open(inode, file, mode, 0);
2456 		if (IS_ERR(och))
2457 			return PTR_ERR(och);
2458 
2459 		rc = 0;
2460 		mutex_lock(&lli->lli_och_mutex);
2461 		if (fd->fd_lease_och == NULL) {
2462 			fd->fd_lease_och = och;
2463 			och = NULL;
2464 		}
2465 		mutex_unlock(&lli->lli_och_mutex);
2466 		if (och != NULL) {
2467 			/* impossible now that only excl is supported for now */
2468 			ll_lease_close(och, inode, &lease_broken);
2469 			rc = -EBUSY;
2470 		}
2471 		return rc;
2472 	}
2473 	case LL_IOC_GET_LEASE: {
2474 		struct ll_inode_info *lli = ll_i2info(inode);
2475 		struct ldlm_lock *lock = NULL;
2476 
2477 		rc = 0;
2478 		mutex_lock(&lli->lli_och_mutex);
2479 		if (fd->fd_lease_och != NULL) {
2480 			struct obd_client_handle *och = fd->fd_lease_och;
2481 
2482 			lock = ldlm_handle2lock(&och->och_lease_handle);
2483 			if (lock != NULL) {
2484 				lock_res_and_lock(lock);
2485 				if (!ldlm_is_cancel(lock))
2486 					rc = och->och_flags &
2487 						(FMODE_READ | FMODE_WRITE);
2488 				unlock_res_and_lock(lock);
2489 				ldlm_lock_put(lock);
2490 			}
2491 		}
2492 		mutex_unlock(&lli->lli_och_mutex);
2493 		return rc;
2494 	}
2495 	case LL_IOC_HSM_IMPORT: {
2496 		struct hsm_user_import *hui;
2497 
2498 		hui = kzalloc(sizeof(*hui), GFP_NOFS);
2499 		if (!hui)
2500 			return -ENOMEM;
2501 
2502 		if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2503 			OBD_FREE_PTR(hui);
2504 			return -EFAULT;
2505 		}
2506 
2507 		rc = ll_hsm_import(inode, file, hui);
2508 
2509 		OBD_FREE_PTR(hui);
2510 		return rc;
2511 	}
2512 	default: {
2513 		int err;
2514 
2515 		if (LLIOC_STOP ==
2516 		     ll_iocontrol_call(inode, file, cmd, arg, &err))
2517 			return err;
2518 
2519 		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2520 				     (void *)arg);
2521 	}
2522 	}
2523 }
2524 
2525 
/* llseek for Lustre files: glimpse the real size from the OSTs before a
 * size-relative seek, then defer to the generic helper. */
static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
{
	struct inode *inode = file_inode(file);
	loff_t eof = 0;
	loff_t pos;

	switch (origin) {
	case SEEK_END:
		pos = offset + i_size_read(inode);
		break;
	case SEEK_CUR:
		pos = offset + file->f_pos;
		break;
	default:
		pos = offset;
		break;
	}
	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
	       inode->i_ino, inode->i_generation, inode, pos, pos,
	       origin);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);

	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
		int rc = ll_glimpse_size(inode);

		if (rc != 0)
			return rc;
		eof = i_size_read(inode);
	}

	return generic_file_llseek_size(file, offset, origin,
					ll_file_maxbytes(inode), eof);
}
2549 
/* flush() for Lustre files: surface async writeback errors recorded
 * against this inode, unless the application was already told about a
 * write failure on this file descriptor. */
static int ll_flush(struct file *file, fl_owner_t id)
{
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	int rc, err;

	LASSERT(!S_ISDIR(inode->i_mode));

	/* Pick up (and clear) errors recorded when async writeback failed
	 * for pages of this mapping; always clear the lov-level state even
	 * if an inode-level error was already found. */
	rc = lli->lli_async_rc;
	lli->lli_async_rc = 0;
	err = lov_read_and_clear_async_rc(lli->lli_clob);
	if (!rc)
		rc = err;

	/* The application has been told write failure already.
	 * Do not report failure again. */
	if (fd->fd_write_failed)
		return 0;

	return rc != 0 ? -EIO : 0;
}
2573 
2574 /**
2575  * Called to make sure a portion of file has been written out.
2576  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2577  *
2578  * Return how many pages have been written.
2579  */
int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
		       enum cl_fsync_mode mode, int ignore_layout)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct cl_io *io;
	struct obd_capa *capa = NULL;
	struct cl_fsync_io *fio;
	int result;

	/* Only the four defined fsync modes are accepted. */
	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
		return -EINVAL;

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		return PTR_ERR(env);

	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);

	io = ccc_env_thread_io(env);
	io->ci_obj = cl_i2info(inode)->lli_clob;
	io->ci_ignore_layout = ignore_layout;

	/* initialize parameters for sync */
	fio = &io->u.ci_fsync;
	fio->fi_capa = capa;
	fio->fi_start = start;
	fio->fi_end = end;
	fio->fi_fid = ll_inode2fid(inode);
	fio->fi_mode = mode;
	fio->fi_nr_written = 0;

	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
	else
		result = io->ci_result;
	/* Per the contract above: on success return the number of pages
	 * written during the sync. */
	if (result == 0)
		result = fio->fi_nr_written;
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);

	capa_put(capa);

	return result;
}
2626 
/*
 * fsync entry point: flush dirty pages, merge any recorded async
 * writeback errors, sync the metadata on the MDS and, for regular files,
 * force an OST_SYNC of the data range.  The first error encountered wins.
 */
int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ptlrpc_request *req;
	struct obd_capa *oc;
	int rc, err;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);

	/* Write out dirty pages in the range before taking i_mutex. */
	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	mutex_lock(&inode->i_mutex);

	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	if (!S_ISDIR(inode->i_mode)) {
		err = lli->lli_async_rc;
		lli->lli_async_rc = 0;
		if (rc == 0)
			rc = err;
		err = lov_read_and_clear_async_rc(lli->lli_clob);
		if (rc == 0)
			rc = err;
	}

	/* Sync metadata on the MDS. */
	oc = ll_mdscapa_get(inode);
	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
		      &req);
	capa_put(oc);
	if (!rc)
		rc = err;
	/* md_sync() only produced a reply to free on success. */
	if (!err)
		ptlrpc_req_finished(req);

	if (S_ISREG(inode->i_mode)) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
		if (rc == 0 && err < 0)
			rc = err;
		/* Record the outcome so ll_flush() does not report the same
		 * failure to the application a second time. */
		if (rc < 0)
			fd->fd_write_failed = true;
		else
			fd->fd_write_failed = false;
	}

	mutex_unlock(&inode->i_mutex);
	return rc;
}
2678 
/*
 * fcntl/flock lock handler: translate the VFS file_lock into an LDLM
 * flock enqueue on the MDT, then mirror the result into the local VFS
 * lock tables so lock state stays consistent on both sides.
 */
static int
ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
	struct inode *inode = file_inode(file);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_enqueue_info einfo = {
		.ei_type	= LDLM_FLOCK,
		.ei_cb_cp	= ldlm_flock_completion_ast,
		.ei_cbdata	= file_lock,
	};
	struct md_op_data *op_data;
	struct lustre_handle lockh = {0};
	ldlm_policy_data_t flock = {{0}};
	__u64 flags = 0;
	int rc;
	int rc2 = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
	       inode->i_ino, file_lock);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

	/* Only FL_FLOCK (with a set command) and FL_POSIX locks are handled. */
	if (file_lock->fl_flags & FL_FLOCK)
		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
	else if (!(file_lock->fl_flags & FL_POSIX))
		return -EINVAL;

	flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
	flock.l_flock.pid = file_lock->fl_pid;
	flock.l_flock.start = file_lock->fl_start;
	flock.l_flock.end = file_lock->fl_end;

	/* Somewhat ugly workaround for svc lockd.
	 * lockd installs custom fl_lmops->lm_compare_owner that checks
	 * for the fl_owner to be the same (which it always is on local node
	 * I guess between lockd processes) and then compares pid.
	 * As such we assign pid to the owner field to make it all work,
	 * conflict with normal locks is unlikely since pid space and
	 * pointer space for current->files are not intersecting */
	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

	/* Map the VFS lock type to an LDLM lock mode. */
	switch (file_lock->fl_type) {
	case F_RDLCK:
		einfo.ei_mode = LCK_PR;
		break;
	case F_UNLCK:
		/* An unlock request may or may not have any relation to
		 * existing locks so we may not be able to pass a lock handle
		 * via a normal ldlm_lock_cancel() request. The request may even
		 * unlock a byte range in the middle of an existing lock. In
		 * order to process an unlock request we need all of the same
		 * information that is given with a normal read or write record
		 * lock request. To avoid creating another ldlm unlock (cancel)
		 * message we'll treat a LCK_NL flock request as an unlock. */
		einfo.ei_mode = LCK_NL;
		break;
	case F_WRLCK:
		einfo.ei_mode = LCK_PW;
		break;
	default:
		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
			file_lock->fl_type);
		return -ENOTSUPP;
	}

	/* Map the fcntl command to enqueue flags: blocking wait, non-blocking
	 * attempt, or test-only. */
	switch (cmd) {
	case F_SETLKW:
#ifdef F_SETLKW64
	case F_SETLKW64:
#endif
		flags = 0;
		break;
	case F_SETLK:
#ifdef F_SETLK64
	case F_SETLK64:
#endif
		flags = LDLM_FL_BLOCK_NOWAIT;
		break;
	case F_GETLK:
#ifdef F_GETLK64
	case F_GETLK64:
#endif
		flags = LDLM_FL_TEST_LOCK;
		/* Save the old mode so that if the mode in the lock changes we
		 * can decrement the appropriate reader or writer refcount. */
		file_lock->fl_type = einfo.ei_mode;
		break;
	default:
		CERROR("unknown fcntl lock command: %d\n", cmd);
		return -EINVAL;
	}

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
	       inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
	       flock.l_flock.start, flock.l_flock.end);

	/* Enqueue the flock on the MDT. */
	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);

	/* Mirror a successful server-side lock (or any unlock) into the
	 * local VFS lock bookkeeping; test-only requests change nothing. */
	if ((file_lock->fl_flags & FL_FLOCK) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK))
		rc2  = flock_lock_file_wait(file, file_lock);
	if ((file_lock->fl_flags & FL_POSIX) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2  = posix_lock_file_wait(file, file_lock);

	/* Local bookkeeping failed after the server granted the lock:
	 * re-enqueue as LCK_NL (treated as unlock, see above) to drop the
	 * server-side lock, and report the local failure. */
	if (rc2 && file_lock->fl_type != F_UNLCK) {
		einfo.ei_mode = LCK_NL;
		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);
		rc = rc2;
	}

	ll_finish_md_op_data(op_data);

	return rc;
}
2803 
2804 static int
ll_file_noflock(struct file * file,int cmd,struct file_lock * file_lock)2805 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2806 {
2807 	return -ENOSYS;
2808 }
2809 
/**
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 * \param inode [IN] inode to search locks on (may be NULL)
 * \param bits [IN/OUT] searched lock bits; matched bits are cleared
 * \param l_req_mode [IN] searched lock mode, or LCK_MINMODE for any mode
 * \retval boolean, true iff all bits are found
 */
int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
{
	struct lustre_handle lockh;
	ldlm_policy_data_t policy;
	/* LCK_MINMODE means "match any of the normal modes". */
	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
				(LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
	struct lu_fid *fid;
	__u64 flags;
	int i;

	if (!inode)
		return 0;

	fid = &ll_i2info(inode)->lli_fid;
	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
	       ldlm_lockname[mode]);

	/* TEST_LOCK: match without taking a reference on the found lock. */
	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
	/* Probe each requested inodebit separately: different bits may be
	 * covered by different locks. */
	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
		policy.l_inodebits.bits = *bits & (1 << i);
		if (policy.l_inodebits.bits == 0)
			continue;

		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
				  &policy, mode, &lockh)) {
			struct ldlm_lock *lock;

			lock = ldlm_handle2lock(&lockh);
			if (lock) {
				/* clear every bit the found lock covers,
				 * not just the one probed */
				*bits &=
				      ~(lock->l_policy_data.l_inodebits.bits);
				LDLM_LOCK_PUT(lock);
			} else {
				*bits &= ~policy.l_inodebits.bits;
			}
		}
	}
	return *bits == 0;
}
2859 
/* Match (and take a reference on) a granted MD lock covering \a bits on
 * this inode; returns the matched mode, with the handle in \a lockh. */
ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
			    struct lustre_handle *lockh, __u64 flags,
			    ldlm_mode_t mode)
{
	ldlm_policy_data_t policy = { .l_inodebits = { bits } };
	struct lu_fid *fid = &ll_i2info(inode)->lli_fid;

	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));

	return md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED | flags,
			     fid, LDLM_IBITS, &policy, mode, lockh);
}
2876 
ll_inode_revalidate_fini(struct inode * inode,int rc)2877 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2878 {
2879 	/* Already unlinked. Just update nlink and return success */
2880 	if (rc == -ENOENT) {
2881 		clear_nlink(inode);
2882 		/* This path cannot be hit for regular files unless in
2883 		 * case of obscure races, so no need to validate size.
2884 		 */
2885 		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2886 			return 0;
2887 	} else if (rc != 0) {
2888 		CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2889 			     "%s: revalidate FID "DFID" error: rc = %d\n",
2890 			     ll_get_fsname(inode->i_sb, NULL, 0),
2891 			     PFID(ll_inode2fid(inode)), rc);
2892 	}
2893 
2894 	return rc;
2895 }
2896 
/*
 * Revalidate the inode against the MDS.  Two strategies, chosen by the
 * server's connect flags: an intent-based getattr-by-fid (ATTRFID), or a
 * plain getattr RPC when no matching MD lock is already cached locally.
 */
static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
{
	struct inode *inode = d_inode(dentry);
	struct ptlrpc_request *req = NULL;
	struct obd_export *exp;
	int rc = 0;

	LASSERT(inode != NULL);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
	       inode->i_ino, inode->i_generation, inode, dentry);

	exp = ll_i2mdexp(inode);

	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
	 *      But under CMD case, it caused some lock issues, should be fixed
	 *      with new CMD ibits lock. See bug 12718 */
	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
		struct lookup_intent oit = { .it_op = IT_GETATTR };
		struct md_op_data *op_data;

		/* LOOKUP-only revalidation needs no full getattr intent. */
		if (ibits == MDS_INODELOCK_LOOKUP)
			oit.it_op = IT_LOOKUP;

		/* Call getattr by fid, so do not provide name at all. */
		op_data = ll_prep_md_op_data(NULL, inode,
					     inode, NULL, 0, 0,
					     LUSTRE_OPC_ANY, NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		oit.it_create_mode |= M_CHECK_STALE;
		rc = md_intent_lock(exp, op_data, NULL, 0,
				    /* we are not interested in name
				       based lookup */
				    &oit, 0, &req,
				    ll_md_blocking_ast, 0);
		ll_finish_md_op_data(op_data);
		oit.it_create_mode &= ~M_CHECK_STALE;
		if (rc < 0) {
			rc = ll_inode_revalidate_fini(inode, rc);
			goto out;
		}

		rc = ll_revalidate_it_finish(req, &oit, inode);
		if (rc != 0) {
			ll_intent_release(&oit);
			goto out;
		}

		/* Unlinked? Unhash dentry, so it is not picked up later by
		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
		   here to preserve get_cwd functionality on 2.6.
		   Bug 10503 */
		if (!d_inode(dentry)->i_nlink)
			d_lustre_invalidate(dentry, 0);

		ll_lookup_finish_locks(&oit, inode);
	} else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) {
		/* No cached MD lock covering the requested bits: the cached
		 * attributes may be stale, fetch them with a getattr RPC. */
		struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry));
		u64 valid = OBD_MD_FLGETATTR;
		struct md_op_data *op_data;
		int ealen = 0;

		if (S_ISREG(inode->i_mode)) {
			/* Also request striping EA for regular files. */
			rc = ll_get_default_mdsize(sbi, &ealen);
			if (rc)
				return rc;
			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
		}

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
					     0, ealen, LUSTRE_OPC_ANY,
					     NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		op_data->op_valid = valid;
		/* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
		 * capa for this inode. Because we only keep capas of dirs
		 * fresh. */
		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
		ll_finish_md_op_data(op_data);
		if (rc) {
			rc = ll_inode_revalidate_fini(inode, rc);
			return rc;
		}

		rc = ll_prep_inode(&inode, req, NULL, NULL);
	}
out:
	ptlrpc_req_finished(req);
	return rc;
}
2991 
/*
 * Revalidate an inode and, for regular files, refresh its size.
 *
 * Non-regular files get their times copied straight from the cached LVB;
 * regular files need a glimpse RPC to the OSTs, except while an HSM
 * restore is in progress (see comment below).
 *
 * \retval 0 on success, negative errno on failure.
 */
static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
{
	struct inode *inode = d_inode(dentry);
	int rc;

	rc = __ll_inode_revalidate(dentry, ibits);
	if (rc != 0)
		return rc;

	/* if object isn't regular file, don't validate size */
	if (!S_ISREG(inode->i_mode)) {
		LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
		LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
		LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
	} else {
		/* In case of restore, the MDT has the right size and has
		 * already send it back without granting the layout lock,
		 * inode is up-to-date so glimpse is useless.
		 * Also to glimpse we need the layout, in case of a running
		 * restore the MDT holds the layout lock so the glimpse will
		 * block up to the end of restore (getattr will block)
		 */
		if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
			rc = ll_glimpse_size(inode);
	}
	return rc;
}
3019 
ll_getattr(struct vfsmount * mnt,struct dentry * de,struct kstat * stat)3020 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3021 {
3022 	struct inode *inode = d_inode(de);
3023 	struct ll_sb_info *sbi = ll_i2sbi(inode);
3024 	struct ll_inode_info *lli = ll_i2info(inode);
3025 	int res = 0;
3026 
3027 	res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3028 				      MDS_INODELOCK_LOOKUP);
3029 	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3030 
3031 	if (res)
3032 		return res;
3033 
3034 	stat->dev = inode->i_sb->s_dev;
3035 	if (ll_need_32bit_api(sbi))
3036 		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3037 	else
3038 		stat->ino = inode->i_ino;
3039 	stat->mode = inode->i_mode;
3040 	stat->nlink = inode->i_nlink;
3041 	stat->uid = inode->i_uid;
3042 	stat->gid = inode->i_gid;
3043 	stat->rdev = inode->i_rdev;
3044 	stat->atime = inode->i_atime;
3045 	stat->mtime = inode->i_mtime;
3046 	stat->ctime = inode->i_ctime;
3047 	stat->blksize = 1 << inode->i_blkbits;
3048 
3049 	stat->size = i_size_read(inode);
3050 	stat->blocks = inode->i_blocks;
3051 
3052 	return 0;
3053 }
3054 
/*
 * VFS ->fiemap handler: translate the kernel's fiemap_extent_info into a
 * Lustre ll_user_fiemap request, run it through ll_do_fiemap(), and copy
 * the mapped extents back into the caller's buffer.
 */
static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		     __u64 start, __u64 len)
{
	int rc;
	size_t num_bytes;
	struct ll_user_fiemap *fiemap;
	unsigned int extent_count = fieinfo->fi_extents_max;

	num_bytes = sizeof(*fiemap) + (extent_count *
				       sizeof(struct ll_fiemap_extent));
	OBD_ALLOC_LARGE(fiemap, num_bytes);

	if (fiemap == NULL)
		return -ENOMEM;

	fiemap->fm_flags = fieinfo->fi_flags;
	fiemap->fm_extent_count = fieinfo->fi_extents_max;
	fiemap->fm_start = start;
	fiemap->fm_length = len;
	/* NOTE(review): only ONE extent is copied in here, apparently to seed
	 * the request with the caller-provided fm_extents[0]; confirm whether
	 * copying a single extent (rather than extent_count) is intended. */
	if (extent_count > 0)
		memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
		       sizeof(struct ll_fiemap_extent));

	rc = ll_do_fiemap(inode, fiemap, num_bytes);

	/* propagate result flags and the mapped extents back to the VFS */
	fieinfo->fi_flags = fiemap->fm_flags;
	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
	if (extent_count > 0)
		memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
		       fiemap->fm_mapped_extents *
		       sizeof(struct ll_fiemap_extent));

	OBD_FREE_LARGE(fiemap, num_bytes);
	return rc;
}
3090 
ll_get_acl(struct inode * inode,int type)3091 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3092 {
3093 	struct ll_inode_info *lli = ll_i2info(inode);
3094 	struct posix_acl *acl = NULL;
3095 
3096 	spin_lock(&lli->lli_lock);
3097 	/* VFS' acl_permission_check->check_acl will release the refcount */
3098 	acl = posix_acl_dup(lli->lli_posix_acl);
3099 	spin_unlock(&lli->lli_lock);
3100 
3101 	return acl;
3102 }
3103 
3104 
/*
 * VFS ->permission handler.
 *
 * Revalidates the root inode (which lookup never revalidates) before the
 * check, delegates to the remote-permission path for remote clients, and
 * otherwise falls back to generic_permission().
 *
 * \retval 0 if access is allowed, negative errno otherwise
 *	   (-ECHILD when called in RCU-walk mode, to force ref-walk).
 */
int ll_inode_permission(struct inode *inode, int mask)
{
	int rc = 0;

#ifdef MAY_NOT_BLOCK
	/* we may block on RPCs below, so bail out of RCU-walk */
	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;
#endif

	/* as root inode are NOT getting validated in lookup operation,
	 * need to do it before permission check. */

	if (is_root_inode(inode)) {
		rc = __ll_inode_revalidate(inode->i_sb->s_root,
					   MDS_INODELOCK_LOOKUP);
		if (rc)
			return rc;
	}

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);

	/* remote clients have their permissions checked on the MDS */
	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
		return lustre_check_remote_perm(inode, mask);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
	rc = generic_permission(inode, mask);

	return rc;
}
3135 
3136 /* -o localflock - only provides locally consistent flock locks */
3137 struct file_operations ll_file_operations = {
3138 	.read_iter = ll_file_read_iter,
3139 	.write_iter = ll_file_write_iter,
3140 	.unlocked_ioctl = ll_file_ioctl,
3141 	.open	   = ll_file_open,
3142 	.release	= ll_file_release,
3143 	.mmap	   = ll_file_mmap,
3144 	.llseek	 = ll_file_seek,
3145 	.splice_read    = ll_file_splice_read,
3146 	.fsync	  = ll_fsync,
3147 	.flush	  = ll_flush
3148 };
3149 
3150 struct file_operations ll_file_operations_flock = {
3151 	.read_iter    = ll_file_read_iter,
3152 	.write_iter   = ll_file_write_iter,
3153 	.unlocked_ioctl = ll_file_ioctl,
3154 	.open	   = ll_file_open,
3155 	.release	= ll_file_release,
3156 	.mmap	   = ll_file_mmap,
3157 	.llseek	 = ll_file_seek,
3158 	.splice_read    = ll_file_splice_read,
3159 	.fsync	  = ll_fsync,
3160 	.flush	  = ll_flush,
3161 	.flock	  = ll_file_flock,
3162 	.lock	   = ll_file_flock
3163 };
3164 
3165 /* These are for -o noflock - to return ENOSYS on flock calls */
3166 struct file_operations ll_file_operations_noflock = {
3167 	.read_iter    = ll_file_read_iter,
3168 	.write_iter   = ll_file_write_iter,
3169 	.unlocked_ioctl = ll_file_ioctl,
3170 	.open	   = ll_file_open,
3171 	.release	= ll_file_release,
3172 	.mmap	   = ll_file_mmap,
3173 	.llseek	 = ll_file_seek,
3174 	.splice_read    = ll_file_splice_read,
3175 	.fsync	  = ll_fsync,
3176 	.flush	  = ll_flush,
3177 	.flock	  = ll_file_noflock,
3178 	.lock	   = ll_file_noflock
3179 };
3180 
/* inode operations for regular Lustre files */
struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.listxattr	= ll_listxattr,
	.removexattr	= ll_removexattr,
	.fiemap		= ll_fiemap,
	.get_acl	= ll_get_acl,
};
3192 
/* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers.  ioc_sem protects
 * ioc_head: readers in ll_iocontrol_call(), writers in
 * ll_iocontrol_register()/ll_iocontrol_unregister(). */
static struct llioc_ctl_data {
	struct rw_semaphore	ioc_sem;
	struct list_head	      ioc_head;
} llioc = {
	__RWSEM_INITIALIZER(llioc.ioc_sem),
	LIST_HEAD_INIT(llioc.ioc_head)
};
3201 
3202 
3203 struct llioc_data {
3204 	struct list_head	      iocd_list;
3205 	unsigned int	    iocd_size;
3206 	llioc_callback_t	iocd_cb;
3207 	unsigned int	    iocd_count;
3208 	unsigned int	    iocd_cmd[0];
3209 };
3210 
/**
 * Register a dynamic ioctl handler.
 *
 * \param cb	callback invoked when one of \a cmd is seen
 * \param count	number of entries in \a cmd (0 <= count <= LLIOC_MAX_CMD)
 * \param cmd	array of ioctl command numbers handled by \a cb
 *
 * \retval an opaque cookie to pass to ll_iocontrol_unregister(), or
 *	   NULL on invalid arguments or allocation failure.
 */
void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
{
	unsigned int size;
	struct llioc_data *in_data = NULL;

	if (cb == NULL || cmd == NULL ||
	    count > LLIOC_MAX_CMD || count < 0)
		return NULL;

	size = sizeof(*in_data) + count * sizeof(unsigned int);
	/* kzalloc() returns zeroed memory, so no extra memset() is needed */
	in_data = kzalloc(size, GFP_NOFS);
	if (!in_data)
		return NULL;

	in_data->iocd_size = size;
	in_data->iocd_cb = cb;
	in_data->iocd_count = count;
	memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);

	down_write(&llioc.ioc_sem);
	list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
	up_write(&llioc.ioc_sem);

	return in_data;
}
EXPORT_SYMBOL(ll_iocontrol_register);
3238 
ll_iocontrol_unregister(void * magic)3239 void ll_iocontrol_unregister(void *magic)
3240 {
3241 	struct llioc_data *tmp;
3242 
3243 	if (magic == NULL)
3244 		return;
3245 
3246 	down_write(&llioc.ioc_sem);
3247 	list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3248 		if (tmp == magic) {
3249 			unsigned int size = tmp->iocd_size;
3250 
3251 			list_del(&tmp->iocd_list);
3252 			up_write(&llioc.ioc_sem);
3253 
3254 			OBD_FREE(tmp, size);
3255 			return;
3256 		}
3257 	}
3258 	up_write(&llioc.ioc_sem);
3259 
3260 	CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3261 }
3262 EXPORT_SYMBOL(ll_iocontrol_unregister);
3263 
3264 static enum llioc_iter
ll_iocontrol_call(struct inode * inode,struct file * file,unsigned int cmd,unsigned long arg,int * rcp)3265 ll_iocontrol_call(struct inode *inode, struct file *file,
3266 		  unsigned int cmd, unsigned long arg, int *rcp)
3267 {
3268 	enum llioc_iter ret = LLIOC_CONT;
3269 	struct llioc_data *data;
3270 	int rc = -EINVAL, i;
3271 
3272 	down_read(&llioc.ioc_sem);
3273 	list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3274 		for (i = 0; i < data->iocd_count; i++) {
3275 			if (cmd != data->iocd_cmd[i])
3276 				continue;
3277 
3278 			ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3279 			break;
3280 		}
3281 
3282 		if (ret == LLIOC_STOP)
3283 			break;
3284 	}
3285 	up_read(&llioc.ioc_sem);
3286 
3287 	if (rcp)
3288 		*rcp = rc;
3289 	return ret;
3290 }
3291 
/**
 * Push a layout configuration operation down to the cl_object layer.
 *
 * For OBJECT_CONF_SET, once the new layout has been applied successfully
 * the layout lock is allowed to be matched by others (see comment below).
 *
 * \retval 0 on success (or when the inode has no cl_object yet),
 *	   negative errno otherwise.
 */
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_env_nest nest;
	struct lu_env *env;
	int result;

	/* no cl_object -> nothing to (re)configure */
	if (lli->lli_clob == NULL)
		return 0;

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		return PTR_ERR(env);

	result = cl_conf_set(env, lli->lli_clob, conf);
	cl_env_nested_put(&nest, env);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;

		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));
		if (result == 0) {
			/* it can only be allowed to match after layout is
			 * applied to inode otherwise false layout would be
			 * seen. Applying layout should happen before dropping
			 * the intent lock. */
			ldlm_lock_allow_match(lock);
		}
	}
	return result;
}
3324 
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)

{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct obd_capa *oc;
	struct ptlrpc_request *req;
	struct mdt_body *body;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
	       lock->l_lvb_data, lock->l_lvb_len);

	/* layout already attached to the lock; nothing to fetch */
	if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
		return 0;

	/* if layout lock was granted right away, the layout is returned
	 * within DLM_LVB of dlm reply; otherwise if the lock was ever
	 * blocked and then granted via completion ast, we have to fetch
	 * layout here. Please note that we can't use the LVB buffer in
	 * completion AST because it doesn't have a large enough buffer */
	oc = ll_mdscapa_get(inode);
	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc == 0)
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
				OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
				lmmsize, 0, &req);
	capa_put(oc);
	if (rc < 0)
		return rc;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL) {
		rc = -EPROTO;
		goto out;
	}

	lmmsize = body->eadatasize;
	if (lmmsize == 0) /* empty layout */ {
		rc = 0;
		goto out;
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL) {
		rc = -EFAULT;
		goto out;
	}

	/* copy the LOV EA out of the reply into a buffer the lock owns;
	 * the reply buffer is freed with the request below */
	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	memcpy(lvbdata, lmm, lmmsize);
	/* swap in the new LVB under the resource lock, dropping any stale
	 * buffer a previous fetch may have installed */
	lock_res_and_lock(lock);
	if (lock->l_lvb_data != NULL)
		OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);

	lock->l_lvb_data = lvbdata;
	lock->l_lvb_len = lmmsize;
	unlock_res_and_lock(lock);

out:
	ptlrpc_req_finished(req);
	return rc;
}
3397 
/**
 * Apply the layout to the inode. Layout lock is held and will be released
 * in this function.
 *
 * \param lockh		handle of the (referenced) layout lock
 * \param mode		mode the lock is held in; dropped before return
 * \param inode		inode the layout belongs to
 * \param[out] gen	layout generation on success
 * \param reconf	if false, only report whether a layout is already
 *			configured (-ENODATA when it is not)
 *
 * \retval 0 on success, -EAGAIN if the caller should retry the enqueue,
 *	   other negative errno on failure.
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
				struct inode *inode, __u32 *gen, bool reconf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info    *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct lustre_md md = { NULL };
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
		   inode, PFID(&lli->lli_fid), reconf);

	/* in case this is a caching lock and reinstate with new inode */
	md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
	unlock_res_and_lock(lock);
	/* checking lvb_ready is racy but this is okay. The worst case is
	 * that multi processes may configure the file on the same time. */
	if (lvb_ready || !reconf) {
		rc = -ENODATA;
		if (lvb_ready) {
			/* layout_gen must be valid if layout lock is not
			 * cancelled and stripe has already set */
			*gen = ll_layout_version_get(lli);
			rc = 0;
		}
		goto out;
	}

	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		goto out;

	/* for layout lock, lmm is returned in lock's lvb.
	 * lvb_data is immutable if the lock is held so it's safe to access it
	 * without res lock. See the description in ldlm_lock_decref_internal()
	 * for the condition to free lvb_data of layout lock */
	if (lock->l_lvb_data != NULL) {
		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
				  lock->l_lvb_data, lock->l_lvb_len);
		if (rc >= 0) {
			/* a NULL lsm after unpacking means an empty layout */
			*gen = LL_LAYOUT_GEN_EMPTY;
			if (md.lsm != NULL)
				*gen = md.lsm->lsm_layout_gen;
			rc = 0;
		} else {
			CERROR("%s: file "DFID" unpackmd error: %d\n",
				ll_get_fsname(inode->i_sb, NULL, 0),
				PFID(&lli->lli_fid), rc);
		}
	}
	if (rc < 0)
		goto out;

	/* set layout to file. Unlikely this will fail as old layout was
	 * surely eliminated */
	memset(&conf, 0, sizeof(conf));
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_md = &md;
	rc = ll_layout_conf(inode, &conf);

	if (md.lsm != NULL)
		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);

	/* refresh layout failed, need to wait */
	wait_layout = rc == -EBUSY;

out:
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
			ll_get_fsname(inode->i_sb, NULL, 0),
			inode, PFID(&lli->lli_fid));

		memset(&conf, 0, sizeof(conf));
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			rc = -EAGAIN;

		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
			PFID(&lli->lli_fid), rc);
	}
	return rc;
}
3504 
/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold layout lock so it may be revoked any time after
 * this function returns. Any operations depend on layout should be redone
 * in that case.
 *
 * This function should be called before lov_io_init() to get an uptodate
 * layout version, the caller should save the version number and after IO
 * is finished, this function should be called again to verify that layout
 * is not changed during IO time.
 *
 * \param inode		inode whose layout version is needed
 * \param[out] gen	current layout generation
 * \retval 0 on success, negative errno on failure.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info  *lli = ll_i2info(inode);
	struct ll_sb_info     *sbi = ll_i2sbi(inode);
	struct md_op_data     *op_data;
	struct lookup_intent   it;
	struct lustre_handle   lockh;
	ldlm_mode_t	       mode;
	struct ldlm_enqueue_info einfo = {
		.ei_type = LDLM_IBITS,
		.ei_mode = LCK_CR,
		.ei_cb_bl = ll_md_blocking_ast,
		.ei_cb_cp = ldlm_completion_ast,
	};
	int rc;

	/* fast path: layout lock disabled, or a generation already known */
	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
		return 0;

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

again:
	/* mostly layout lock is caching on the local side, so try to match
	 * it before grabbing layout lock mutex. */
	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
	if (mode != 0) { /* hit cached lock */
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
		if (rc == -EAGAIN)
			goto again;

		mutex_unlock(&lli->lli_layout_mutex);
		return rc;
	}

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
			0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		mutex_unlock(&lli->lli_layout_mutex);
		return PTR_ERR(op_data);
	}

	/* have to enqueue one */
	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	lockh.cookie = 0ULL;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
			ll_get_fsname(inode->i_sb, NULL, 0), inode,
			PFID(&lli->lli_fid));

	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
			NULL, 0, NULL, 0);
	/* drop the reply request held by the intent */
	if (it.d.lustre.it_data != NULL)
		ptlrpc_req_finished(it.d.lustre.it_data);
	it.d.lustre.it_data = NULL;

	ll_finish_md_op_data(op_data);

	/* keep the lock reference ourselves; clear it from the intent so
	 * ll_intent_drop_lock() below won't release it */
	mode = it.d.lustre.it_lock_mode;
	it.d.lustre.it_lock_mode = 0;
	ll_intent_drop_lock(&it);

	if (rc == 0) {
		/* set lock data in case this is a new lock */
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
		if (rc == -EAGAIN)
			goto again;
	}
	mutex_unlock(&lli->lli_layout_mutex);

	return rc;
}
3598 
3599 /**
3600  *  This function send a restore request to the MDT
3601  */
ll_layout_restore(struct inode * inode)3602 int ll_layout_restore(struct inode *inode)
3603 {
3604 	struct hsm_user_request	*hur;
3605 	int			 len, rc;
3606 
3607 	len = sizeof(struct hsm_user_request) +
3608 	      sizeof(struct hsm_user_item);
3609 	hur = kzalloc(len, GFP_NOFS);
3610 	if (!hur)
3611 		return -ENOMEM;
3612 
3613 	hur->hur_request.hr_action = HUA_RESTORE;
3614 	hur->hur_request.hr_archive_id = 0;
3615 	hur->hur_request.hr_flags = 0;
3616 	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3617 	       sizeof(hur->hur_user_item[0].hui_fid));
3618 	hur->hur_user_item[0].hui_extent.length = -1;
3619 	hur->hur_request.hr_itemcount = 1;
3620 	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3621 			   len, hur, NULL);
3622 	OBD_FREE(hur, len);
3623 	return rc;
3624 }
3625