1/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ 2/* 3 * aoedev.c 4 * AoE device utility functions; maintains device list. 5 */ 6 7#include <linux/hdreg.h> 8#include <linux/blkdev.h> 9#include <linux/netdevice.h> 10#include <linux/delay.h> 11#include <linux/slab.h> 12#include <linux/bitmap.h> 13#include <linux/kdev_t.h> 14#include <linux/moduleparam.h> 15#include <linux/string.h> 16#include "aoe.h" 17 18static void dummy_timer(ulong); 19static void freetgt(struct aoedev *d, struct aoetgt *t); 20static void skbpoolfree(struct aoedev *d); 21 22static int aoe_dyndevs = 1; 23module_param(aoe_dyndevs, int, 0644); 24MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices."); 25 26static struct aoedev *devlist; 27static DEFINE_SPINLOCK(devlist_lock); 28 29/* Because some systems will have one, many, or no 30 * - partitions, 31 * - slots per shelf, 32 * - or shelves, 33 * we need some flexibility in the way the minor numbers 34 * are allocated. So they are dynamic. 35 */ 36#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS) 37 38static DEFINE_SPINLOCK(used_minors_lock); 39static DECLARE_BITMAP(used_minors, N_DEVS); 40 41static int 42minor_get_dyn(ulong *sysminor) 43{ 44 ulong flags; 45 ulong n; 46 int error = 0; 47 48 spin_lock_irqsave(&used_minors_lock, flags); 49 n = find_first_zero_bit(used_minors, N_DEVS); 50 if (n < N_DEVS) 51 set_bit(n, used_minors); 52 else 53 error = -1; 54 spin_unlock_irqrestore(&used_minors_lock, flags); 55 56 *sysminor = n * AOE_PARTITIONS; 57 return error; 58} 59 60static int 61minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin) 62{ 63 ulong flags; 64 ulong n; 65 int error = 0; 66 enum { 67 /* for backwards compatibility when !aoe_dyndevs, 68 * a static number of supported slots per shelf */ 69 NPERSHELF = 16, 70 }; 71 72 if (aoemin >= NPERSHELF) { 73 pr_err("aoe: %s %d slots per shelf\n", 74 "static minor device numbers support only", 75 NPERSHELF); 76 error = -1; 77 goto out; 78 } 79 80 n = aoemaj * NPERSHELF + aoemin; 81 if (n >= N_DEVS) { 82 pr_err("aoe: %s with e%ld.%d\n", 83 "cannot use static minor device numbers", 84 aoemaj, aoemin); 85 error = -1; 86 goto out; 87 } 88 89 spin_lock_irqsave(&used_minors_lock, flags); 90 if (test_bit(n, used_minors)) { 91 pr_err("aoe: %s %lu\n", 92 "existing device already has static minor number", 93 n); 94 error = -1; 95 } else 96 set_bit(n, used_minors); 97 spin_unlock_irqrestore(&used_minors_lock, flags); 98 *sysminor = n * AOE_PARTITIONS; 99out: 100 return error; 101} 102 103static int 104minor_get(ulong *sysminor, ulong aoemaj, int aoemin) 105{ 106 if (aoe_dyndevs) 107 return minor_get_dyn(sysminor); 108 else 109 return minor_get_static(sysminor, aoemaj, aoemin); 110} 111 112static void 113minor_free(ulong minor) 114{ 115 ulong flags; 116 117 minor /= AOE_PARTITIONS; 118 BUG_ON(minor >= N_DEVS); 119 120 spin_lock_irqsave(&used_minors_lock, flags); 121 BUG_ON(!test_bit(minor, used_minors)); 122 clear_bit(minor, used_minors); 123 spin_unlock_irqrestore(&used_minors_lock, flags); 124} 125 126/* 127 * Users who grab a pointer to the device with aoedev_by_aoeaddr 128 * automatically get a reference count and must be responsible 129 * for performing a aoedev_put. With the addition of async 130 * kthread processing I'm no longer confident that we can 131 * guarantee consistency in the face of device flushes. 132 * 133 * For the time being, we only bother to add extra references for 134 * frames sitting on the iocq. When the kthreads finish processing 135 * these frames, they will aoedev_put the device. 136 */ 137 138void 139aoedev_put(struct aoedev *d) 140{ 141 ulong flags; 142 143 spin_lock_irqsave(&devlist_lock, flags); 144 d->ref--; 145 spin_unlock_irqrestore(&devlist_lock, flags); 146} 147 148static void 149dummy_timer(ulong vp) 150{ 151 struct aoedev *d; 152 153 d = (struct aoedev *)vp; 154 if (d->flags & DEVFL_TKILL) 155 return; 156 d->timer.expires = jiffies + HZ; 157 add_timer(&d->timer); 158} 159 160static void 161aoe_failip(struct aoedev *d) 162{ 163 struct request *rq; 164 struct bio *bio; 165 unsigned long n; 166 167 aoe_failbuf(d, d->ip.buf); 168 169 rq = d->ip.rq; 170 if (rq == NULL) 171 return; 172 while ((bio = d->ip.nxbio)) { 173 bio->bi_error = -EIO; 174 d->ip.nxbio = bio->bi_next; 175 n = (unsigned long) rq->special; 176 rq->special = (void *) --n; 177 } 178 if ((unsigned long) rq->special == 0) 179 aoe_end_request(d, rq, 0); 180} 181 182static void 183downdev_frame(struct list_head *pos) 184{ 185 struct frame *f; 186 187 f = list_entry(pos, struct frame, head); 188 list_del(pos); 189 if (f->buf) { 190 f->buf->nframesout--; 191 aoe_failbuf(f->t->d, f->buf); 192 } 193 aoe_freetframe(f); 194} 195 196void 197aoedev_downdev(struct aoedev *d) 198{ 199 struct aoetgt *t, **tt, **te; 200 struct list_head *head, *pos, *nx; 201 struct request *rq; 202 int i; 203 204 d->flags &= ~DEVFL_UP; 205 206 /* clean out active and to-be-retransmitted buffers */ 207 for (i = 0; i < NFACTIVE; i++) { 208 head = &d->factive[i]; 209 list_for_each_safe(pos, nx, head) 210 downdev_frame(pos); 211 } 212 head = &d->rexmitq; 213 list_for_each_safe(pos, nx, head) 214 downdev_frame(pos); 215 216 /* reset window dressings */ 217 tt = d->targets; 218 te = tt + d->ntargets; 219 for (; tt < te && (t = *tt); tt++) { 220 aoecmd_wreset(t); 221 t->nout = 0; 222 } 223 224 /* clean out the in-process request (if any) */ 225 aoe_failip(d); 226 227 /* fast fail all pending I/O */ 228 if (d->blkq) { 229 while ((rq = blk_peek_request(d->blkq))) { 230 blk_start_request(rq); 231 aoe_end_request(d, rq, 1); 232 } 233 } 234 235 if (d->gd) 236 set_capacity(d->gd, 0); 237} 238 239/* return whether the user asked for this particular 240 * device to be flushed 241 */ 242static int 243user_req(char *s, size_t slen, struct aoedev *d) 244{ 245 const char *p; 246 size_t lim; 247 248 if (!d->gd) 249 return 0; 250 p = kbasename(d->gd->disk_name); 251 lim = sizeof(d->gd->disk_name); 252 lim -= p - d->gd->disk_name; 253 if (slen < lim) 254 lim = slen; 255 256 return !strncmp(s, p, lim); 257} 258 259static void 260freedev(struct aoedev *d) 261{ 262 struct aoetgt **t, **e; 263 int freeing = 0; 264 unsigned long flags; 265 266 spin_lock_irqsave(&d->lock, flags); 267 if (d->flags & DEVFL_TKILL 268 && !(d->flags & DEVFL_FREEING)) { 269 d->flags |= DEVFL_FREEING; 270 freeing = 1; 271 } 272 spin_unlock_irqrestore(&d->lock, flags); 273 if (!freeing) 274 return; 275 276 del_timer_sync(&d->timer); 277 if (d->gd) { 278 aoedisk_rm_debugfs(d); 279 aoedisk_rm_sysfs(d); 280 del_gendisk(d->gd); 281 put_disk(d->gd); 282 blk_cleanup_queue(d->blkq); 283 } 284 t = d->targets; 285 e = t + d->ntargets; 286 for (; t < e && *t; t++) 287 freetgt(d, *t); 288 if (d->bufpool) 289 mempool_destroy(d->bufpool); 290 skbpoolfree(d); 291 minor_free(d->sysminor); 292 293 spin_lock_irqsave(&d->lock, flags); 294 d->flags |= DEVFL_FREED; 295 spin_unlock_irqrestore(&d->lock, flags); 296} 297 298enum flush_parms { 299 NOT_EXITING = 0, 300 EXITING = 1, 301}; 302 303static int 304flush(const char __user *str, size_t cnt, int exiting) 305{ 306 ulong flags; 307 struct aoedev *d, **dd; 308 char buf[16]; 309 int all = 0; 310 int specified = 0; /* flush a specific device */ 311 unsigned int skipflags; 312 313 skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL; 314 315 if (!exiting && cnt >= 3) { 316 if (cnt > sizeof buf) 317 cnt = sizeof buf; 318 if (copy_from_user(buf, str, cnt)) 319 return -EFAULT; 320 all = !strncmp(buf, "all", 3); 321 if (!all) 322 specified = 1; 323 } 324 325 flush_scheduled_work(); 326 /* pass one: without sleeping, do aoedev_downdev */ 327 spin_lock_irqsave(&devlist_lock, flags); 328 for (d = devlist; d; d = d->next) { 329 spin_lock(&d->lock); 330 if (exiting) { 331 /* unconditionally take each device down */ 332 } else if (specified) { 333 if (!user_req(buf, cnt, d)) 334 goto cont; 335 } else if ((!all && (d->flags & DEVFL_UP)) 336 || d->flags & skipflags 337 || d->nopen 338 || d->ref) 339 goto cont; 340 341 aoedev_downdev(d); 342 d->flags |= DEVFL_TKILL; 343cont: 344 spin_unlock(&d->lock); 345 } 346 spin_unlock_irqrestore(&devlist_lock, flags); 347 348 /* pass two: call freedev, which might sleep, 349 * for aoedevs marked with DEVFL_TKILL 350 */ 351restart: 352 spin_lock_irqsave(&devlist_lock, flags); 353 for (d = devlist; d; d = d->next) { 354 spin_lock(&d->lock); 355 if (d->flags & DEVFL_TKILL 356 && !(d->flags & DEVFL_FREEING)) { 357 spin_unlock(&d->lock); 358 spin_unlock_irqrestore(&devlist_lock, flags); 359 freedev(d); 360 goto restart; 361 } 362 spin_unlock(&d->lock); 363 } 364 365 /* pass three: remove aoedevs marked with DEVFL_FREED */ 366 for (dd = &devlist, d = *dd; d; d = *dd) { 367 struct aoedev *doomed = NULL; 368 369 spin_lock(&d->lock); 370 if (d->flags & DEVFL_FREED) { 371 *dd = d->next; 372 doomed = d; 373 } else { 374 dd = &d->next; 375 } 376 spin_unlock(&d->lock); 377 if (doomed) 378 kfree(doomed->targets); 379 kfree(doomed); 380 } 381 spin_unlock_irqrestore(&devlist_lock, flags); 382 383 return 0; 384} 385 386int 387aoedev_flush(const char __user *str, size_t cnt) 388{ 389 return flush(str, cnt, NOT_EXITING); 390} 391 392/* This has been confirmed to occur once with Tms=3*1000 due to the 393 * driver changing link and not processing its transmit ring. The 394 * problem is hard enough to solve by returning an error that I'm 395 * still punting on "solving" this. 396 */ 397static void 398skbfree(struct sk_buff *skb) 399{ 400 enum { Sms = 250, Tms = 30 * 1000}; 401 int i = Tms / Sms; 402 403 if (skb == NULL) 404 return; 405 while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0) 406 msleep(Sms); 407 if (i < 0) { 408 printk(KERN_ERR 409 "aoe: %s holds ref: %s\n", 410 skb->dev ? skb->dev->name : "netif", 411 "cannot free skb -- memory leaked."); 412 return; 413 } 414 skb->truesize -= skb->data_len; 415 skb_shinfo(skb)->nr_frags = skb->data_len = 0; 416 skb_trim(skb, 0); 417 dev_kfree_skb(skb); 418} 419 420static void 421skbpoolfree(struct aoedev *d) 422{ 423 struct sk_buff *skb, *tmp; 424 425 skb_queue_walk_safe(&d->skbpool, skb, tmp) 426 skbfree(skb); 427 428 __skb_queue_head_init(&d->skbpool); 429} 430 431/* find it or allocate it */ 432struct aoedev * 433aoedev_by_aoeaddr(ulong maj, int min, int do_alloc) 434{ 435 struct aoedev *d; 436 int i; 437 ulong flags; 438 ulong sysminor = 0; 439 440 spin_lock_irqsave(&devlist_lock, flags); 441 442 for (d=devlist; d; d=d->next) 443 if (d->aoemajor == maj && d->aoeminor == min) { 444 spin_lock(&d->lock); 445 if (d->flags & DEVFL_TKILL) { 446 spin_unlock(&d->lock); 447 d = NULL; 448 goto out; 449 } 450 d->ref++; 451 spin_unlock(&d->lock); 452 break; 453 } 454 if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0) 455 goto out; 456 d = kcalloc(1, sizeof *d, GFP_ATOMIC); 457 if (!d) 458 goto out; 459 d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC); 460 if (!d->targets) { 461 kfree(d); 462 d = NULL; 463 goto out; 464 } 465 d->ntargets = NTARGETS; 466 INIT_WORK(&d->work, aoecmd_sleepwork); 467 spin_lock_init(&d->lock); 468 skb_queue_head_init(&d->skbpool); 469 init_timer(&d->timer); 470 d->timer.data = (ulong) d; 471 d->timer.function = dummy_timer; 472 d->timer.expires = jiffies + HZ; 473 add_timer(&d->timer); 474 d->bufpool = NULL; /* defer to aoeblk_gdalloc */ 475 d->tgt = d->targets; 476 d->ref = 1; 477 for (i = 0; i < NFACTIVE; i++) 478 INIT_LIST_HEAD(&d->factive[i]); 479 INIT_LIST_HEAD(&d->rexmitq); 480 d->sysminor = sysminor; 481 d->aoemajor = maj; 482 d->aoeminor = min; 483 d->rttavg = RTTAVG_INIT; 484 d->rttdev = RTTDEV_INIT; 485 d->next = devlist; 486 devlist = d; 487 out: 488 spin_unlock_irqrestore(&devlist_lock, flags); 489 return d; 490} 491 492static void 493freetgt(struct aoedev *d, struct aoetgt *t) 494{ 495 struct frame *f; 496 struct list_head *pos, *nx, *head; 497 struct aoeif *ifp; 498 499 for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) { 500 if (!ifp->nd) 501 break; 502 dev_put(ifp->nd); 503 } 504 505 head = &t->ffree; 506 list_for_each_safe(pos, nx, head) { 507 list_del(pos); 508 f = list_entry(pos, struct frame, head); 509 skbfree(f->skb); 510 kfree(f); 511 } 512 kfree(t); 513} 514 515void 516aoedev_exit(void) 517{ 518 flush_scheduled_work(); 519 flush(NULL, 0, EXITING); 520} 521 522int __init 523aoedev_init(void) 524{ 525 return 0; 526} 527