/******************************************************************************
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/block/xen-blkfront.c
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#define pr_fmt(fmt) "xen-blkback: " fmt

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/bitmap.h>

#include <xen/events.h>
#include <xen/page.h>
#include <xen/xen.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/balloon.h>
#include <xen/grant_table.h>
#include "common.h"

/*
 * Maximum number of unused free pages to keep in the internal buffer.
 * Setting this to a value too low will reduce memory used in each backend,
 * but can have a performance penalty.
 *
 * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
 * be set to a lower value that might degrade performance on some intensive
 * IO workloads.
 */

static int xen_blkif_max_buffer_pages = 1024;
module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
MODULE_PARM_DESC(max_buffer_pages,
"Maximum number of free pages to keep in each block backend buffer");
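
/*
 * Pages used as targets for grant mappings are taken from and returned to a
 * per-backend pool (blkif->free_pages, managed by get_free_page() and
 * put_free_pages() below) so they can be reused instead of being allocated
 * and freed through the grant-table code on every request.
 * xen_blkif_max_buffer_pages above caps how many idle pages each backend
 * keeps; the surplus is released by shrink_free_pagepool() from the backend
 * thread.
 */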

/*
 * Maximum number of grants to map persistently in blkback. For maximum
 * performance this should be the total number of grants that can be used
 * to fill the ring, but since this might become too high, especially with
 * the use of indirect descriptors, we set it to a value that provides good
 * performance without using too much memory.
 *
 * When the list of persistent grants is full we clean it up using an LRU
 * algorithm.
 */

static int xen_blkif_max_pgrants = 1056;
module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
MODULE_PARM_DESC(max_persistent_grants,
                 "Maximum number of grants to map persistently");

/*
 * The LRU mechanism to clean the lists of persistent grants needs to
 * be executed periodically. The time interval between consecutive executions
 * of the purge mechanism is set in ms.
 */
#define LRU_INTERVAL 100

/*
 * When the persistent grants list is full we will remove unused grants
 * from the list. This is the percentage of grants to be removed on each
 * LRU pass.
 */
#define LRU_PERCENT_CLEAN 5

/* Run-time switchable: /sys/module/xen_blkback/parameters/ */
static unsigned int log_stats;
module_param(log_stats, int, 0644);

#define BLKBACK_INVALID_HANDLE (~0)

/* Number of free pages to remove on each call to gnttab_free_pages */
#define NUM_BATCH_FREE_PAGES 10

static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
{
        unsigned long flags;

        spin_lock_irqsave(&blkif->free_pages_lock, flags);
        if (list_empty(&blkif->free_pages)) {
                BUG_ON(blkif->free_pages_num != 0);
                spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
                return gnttab_alloc_pages(1, page);
        }
        BUG_ON(blkif->free_pages_num == 0);
        page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
        list_del(&page[0]->lru);
        blkif->free_pages_num--;
        spin_unlock_irqrestore(&blkif->free_pages_lock, flags);

        return 0;
}

static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
                                  int num)
{
        unsigned long flags;
        int i;

        spin_lock_irqsave(&blkif->free_pages_lock, flags);
        for (i = 0; i < num; i++)
                list_add(&page[i]->lru, &blkif->free_pages);
        blkif->free_pages_num += num;
        spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
}

static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
{
        /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
        struct page *page[NUM_BATCH_FREE_PAGES];
        unsigned int num_pages = 0;
        unsigned long flags;

        spin_lock_irqsave(&blkif->free_pages_lock, flags);
        while (blkif->free_pages_num > num) {
                BUG_ON(list_empty(&blkif->free_pages));
                page[num_pages] = list_first_entry(&blkif->free_pages,
                                                   struct page, lru);
                list_del(&page[num_pages]->lru);
                blkif->free_pages_num--;
                if (++num_pages == NUM_BATCH_FREE_PAGES) {
                        spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
                        gnttab_free_pages(num_pages, page);
                        spin_lock_irqsave(&blkif->free_pages_lock, flags);
                        num_pages = 0;
                }
        }
        spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
        if (num_pages != 0)
                gnttab_free_pages(num_pages, page);
}

#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))

static int do_block_io_op(struct xen_blkif *blkif);
static int dispatch_rw_block_io(struct xen_blkif *blkif,
                                struct blkif_request *req,
                                struct pending_req *pending_req);
static void make_response(struct xen_blkif *blkif, u64 id,
                          unsigned short op, int st);
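
/*
 * Deletion-safe iteration over a tree of persistent grants: the successor
 * (@n) is looked up before the loop body runs, so the body is free to
 * rb_erase() and kfree() the current entry (@pos) without breaking the walk.
 */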
#define foreach_grant_safe(pos, n, rbtree, node) \
        for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
             (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
             &(pos)->node != NULL; \
             (pos) = container_of(n, typeof(*(pos)), node), \
             (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)


/*
 * We don't need locking around the persistent grant helpers
 * because blkback uses a single thread for each backend, so we
 * can be sure that these functions will never be called concurrently.
 *
 * The only exception to that is put_persistent_gnt, which can be called
 * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
 * bit operations to modify the flags of a persistent grant and to count
 * the number of used grants.
 */
static int add_persistent_gnt(struct xen_blkif *blkif,
                              struct persistent_gnt *persistent_gnt)
{
        struct rb_node **new = NULL, *parent = NULL;
        struct persistent_gnt *this;

        if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
                if (!blkif->vbd.overflow_max_grants)
                        blkif->vbd.overflow_max_grants = 1;
                return -EBUSY;
        }
        /* Figure out where to put new node */
        new = &blkif->persistent_gnts.rb_node;
        while (*new) {
                this = container_of(*new, struct persistent_gnt, node);

                parent = *new;
                if (persistent_gnt->gnt < this->gnt)
                        new = &((*new)->rb_left);
                else if (persistent_gnt->gnt > this->gnt)
                        new = &((*new)->rb_right);
                else {
                        pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
                        return -EINVAL;
                }
        }

        bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
        set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
        /* Add new node and rebalance tree. */
        rb_link_node(&(persistent_gnt->node), parent, new);
        rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
        blkif->persistent_gnt_c++;
        atomic_inc(&blkif->persistent_gnt_in_use);
        return 0;
}

static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
                                                 grant_ref_t gref)
{
        struct persistent_gnt *data;
        struct rb_node *node = NULL;

        node = blkif->persistent_gnts.rb_node;
        while (node) {
                data = container_of(node, struct persistent_gnt, node);

                if (gref < data->gnt)
                        node = node->rb_left;
                else if (gref > data->gnt)
                        node = node->rb_right;
                else {
                        if (test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
                                pr_alert_ratelimited("requesting a grant already in use\n");
                                return NULL;
                        }
                        set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
                        atomic_inc(&blkif->persistent_gnt_in_use);
                        return data;
                }
        }
        return NULL;
}

static void put_persistent_gnt(struct xen_blkif *blkif,
                               struct persistent_gnt *persistent_gnt)
{
        if (!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
                pr_alert_ratelimited("freeing a grant already unused\n");
        set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
        clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
        atomic_dec(&blkif->persistent_gnt_in_use);
}
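
/*
 * Unmap and release every grant in @root: unmap operations are batched in
 * groups of BLKIF_MAX_SEGMENTS_PER_REQUEST, the backing pages are returned
 * to the free pool and the persistent_gnt structures are freed. Called from
 * xen_blkbk_free_caches() when the backend is being torn down, so nothing
 * can race with the tree walk.
 */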
static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
                                 unsigned int num)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt;
        struct rb_node *n;
        int segs_to_unmap = 0;
        struct gntab_unmap_queue_data unmap_data;

        unmap_data.pages = pages;
        unmap_data.unmap_ops = unmap;
        unmap_data.kunmap_ops = NULL;

        foreach_grant_safe(persistent_gnt, n, root, node) {
                BUG_ON(persistent_gnt->handle ==
                       BLKBACK_INVALID_HANDLE);
                gnttab_set_unmap_op(&unmap[segs_to_unmap],
                                    (unsigned long)
                                    pfn_to_kaddr(page_to_pfn(
                                                 persistent_gnt->page)),
                                    GNTMAP_host_map,
                                    persistent_gnt->handle);

                pages[segs_to_unmap] = persistent_gnt->page;

                if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
                    !rb_next(&persistent_gnt->node)) {

                        unmap_data.count = segs_to_unmap;
                        BUG_ON(gnttab_unmap_refs_sync(&unmap_data));

                        put_free_pages(blkif, pages, segs_to_unmap);
                        segs_to_unmap = 0;
                }

                rb_erase(&persistent_gnt->node, root);
                kfree(persistent_gnt);
                num--;
        }
        BUG_ON(num != 0);
}

void xen_blkbk_unmap_purged_grants(struct work_struct *work)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt;
        int segs_to_unmap = 0;
        struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
        struct gntab_unmap_queue_data unmap_data;

        unmap_data.pages = pages;
        unmap_data.unmap_ops = unmap;
        unmap_data.kunmap_ops = NULL;

        while (!list_empty(&blkif->persistent_purge_list)) {
                persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
                                                  struct persistent_gnt,
                                                  remove_node);
                list_del(&persistent_gnt->remove_node);

                gnttab_set_unmap_op(&unmap[segs_to_unmap],
                                    vaddr(persistent_gnt->page),
                                    GNTMAP_host_map,
                                    persistent_gnt->handle);

                pages[segs_to_unmap] = persistent_gnt->page;

                if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
                        unmap_data.count = segs_to_unmap;
                        BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
                        put_free_pages(blkif, pages, segs_to_unmap);
                        segs_to_unmap = 0;
                }
                kfree(persistent_gnt);
        }
        if (segs_to_unmap > 0) {
                unmap_data.count = segs_to_unmap;
                BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
                put_free_pages(blkif, pages, segs_to_unmap);
        }
}
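
/*
 * LRU-style purge of the persistent grant tree. Grants currently in use by
 * an in-flight request (PERSISTENT_GNT_ACTIVE) are never removed. A first
 * scan drops grants that have not been used since the previous purge; if
 * that is not enough to reach the target, a second scan also drops grants
 * that were used (PERSISTENT_GNT_WAS_ACTIVE). A final pass clears
 * WAS_ACTIVE so usage is tracked afresh for the next interval. The selected
 * grants are moved to persistent_purge_list and unmapped later by
 * xen_blkbk_unmap_purged_grants() from a work item.
 */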
static void purge_persistent_gnt(struct xen_blkif *blkif)
{
        struct persistent_gnt *persistent_gnt;
        struct rb_node *n;
        unsigned int num_clean, total;
        bool scan_used = false, clean_used = false;
        struct rb_root *root;

        if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
            (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
             !blkif->vbd.overflow_max_grants)) {
                return;
        }

        if (work_busy(&blkif->persistent_purge_work)) {
                pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
                return;
        }

        num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
        num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
        num_clean = min(blkif->persistent_gnt_c, num_clean);
        if ((num_clean == 0) ||
            (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
                return;

        /*
         * At this point, we can be sure that there will be no calls
         * to get_persistent_gnt (because we are executing this code from
         * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
         * which means that the number of currently used grants will go down,
         * but never up, so we will always be able to remove the requested
         * number of grants.
         */

        total = num_clean;

        pr_debug("Going to purge %u persistent grants\n", num_clean);

        BUG_ON(!list_empty(&blkif->persistent_purge_list));
        root = &blkif->persistent_gnts;
purge_list:
        foreach_grant_safe(persistent_gnt, n, root, node) {
                BUG_ON(persistent_gnt->handle ==
                       BLKBACK_INVALID_HANDLE);

                if (clean_used) {
                        clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
                        continue;
                }

                if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
                        continue;
                if (!scan_used &&
                    (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
                        continue;

                rb_erase(&persistent_gnt->node, root);
                list_add(&persistent_gnt->remove_node,
                         &blkif->persistent_purge_list);
                if (--num_clean == 0)
                        goto finished;
        }
        /*
         * If we get here it means we also need to start cleaning
         * grants that were used since the last purge in order to reach
         * the requested number.
         */
        if (!scan_used && !clean_used) {
                pr_debug("Still missing %u purged frames\n", num_clean);
                scan_used = true;
                goto purge_list;
        }
finished:
        if (!clean_used) {
                pr_debug("Finished scanning for grants to clean, removing used flag\n");
                clean_used = true;
                goto purge_list;
        }

        blkif->persistent_gnt_c -= (total - num_clean);
        blkif->vbd.overflow_max_grants = 0;

        /* We can defer this work */
        schedule_work(&blkif->persistent_purge_work);
        pr_debug("Purged %u/%u\n", (total - num_clean), total);
        return;
}

/*
 * Retrieve a free pending_req structure from the 'pending_free' list.
 */
static struct pending_req *alloc_req(struct xen_blkif *blkif)
{
        struct pending_req *req = NULL;
        unsigned long flags;

        spin_lock_irqsave(&blkif->pending_free_lock, flags);
        if (!list_empty(&blkif->pending_free)) {
                req = list_entry(blkif->pending_free.next, struct pending_req,
                                 free_list);
                list_del(&req->free_list);
        }
        spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
        return req;
}

/*
 * Return the 'pending_req' structure to the free pool. We also
 * wake up the thread if it was waiting for a free request.
 */
static void free_req(struct xen_blkif *blkif, struct pending_req *req)
{
        unsigned long flags;
        int was_empty;

        spin_lock_irqsave(&blkif->pending_free_lock, flags);
        was_empty = list_empty(&blkif->pending_free);
        list_add(&req->free_list, &blkif->pending_free);
        spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
        if (was_empty)
                wake_up(&blkif->pending_free_wq);
}

/*
 * Routines for managing virtual block devices (vbds).
 */
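/*
 * Check that the request lies within the virtual block device and that the
 * device is writable if the request is not a READ, then fill in the
 * physical device number and block device to use.
 */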
static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
                             int operation)
{
        struct xen_vbd *vbd = &blkif->vbd;
        int rc = -EACCES;

        if ((operation != READ) && vbd->readonly)
                goto out;

        if (likely(req->nr_sects)) {
                blkif_sector_t end = req->sector_number + req->nr_sects;

                if (unlikely(end < req->sector_number))
                        goto out;
                if (unlikely(end > vbd_sz(vbd)))
                        goto out;
        }

        req->dev  = vbd->pdevice;
        req->bdev = vbd->bdev;
        rc = 0;

 out:
        return rc;
}

static void xen_vbd_resize(struct xen_blkif *blkif)
{
        struct xen_vbd *vbd = &blkif->vbd;
        struct xenbus_transaction xbt;
        int err;
        struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
        unsigned long long new_size = vbd_sz(vbd);

        pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
                blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
        pr_info("VBD Resize: new size %llu\n", new_size);
        vbd->size = new_size;
again:
        err = xenbus_transaction_start(&xbt);
        if (err) {
                pr_warn("Error starting transaction\n");
                return;
        }
        err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
                            (unsigned long long)vbd_sz(vbd));
        if (err) {
                pr_warn("Error writing new size\n");
                goto abort;
        }
        /*
         * Write the current state; we will use this to synchronize
         * the front-end. If the current state is "connected" the
         * front-end will get the new size information online.
         */
        err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
        if (err) {
                pr_warn("Error writing the state\n");
                goto abort;
        }

        err = xenbus_transaction_end(xbt, 0);
        if (err == -EAGAIN)
                goto again;
        if (err)
                pr_warn("Error ending transaction\n");
        return;
abort:
        xenbus_transaction_end(xbt, 1);
}
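
/*
 * Each virtual block device is driven by a dedicated kernel thread
 * (xen_blkif_schedule below). The event-channel interrupt handler only
 * records that work is pending and wakes that thread; all actual request
 * processing happens in thread context.
 */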

/*
 * Notification from the guest OS.
 */
static void blkif_notify_work(struct xen_blkif *blkif)
{
        blkif->waiting_reqs = 1;
        wake_up(&blkif->wq);
}

irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
{
        blkif_notify_work(dev_id);
        return IRQ_HANDLED;
}

/*
 * SCHEDULER FUNCTIONS
 */

static void print_stats(struct xen_blkif *blkif)
{
        pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
                " | ds %4llu | pg: %4u/%4d\n",
                current->comm, blkif->st_oo_req,
                blkif->st_rd_req, blkif->st_wr_req,
                blkif->st_f_req, blkif->st_ds_req,
                blkif->persistent_gnt_c,
                xen_blkif_max_pgrants);
        blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
        blkif->st_rd_req = 0;
        blkif->st_wr_req = 0;
        blkif->st_oo_req = 0;
        blkif->st_ds_req = 0;
}

int xen_blkif_schedule(void *arg)
{
        struct xen_blkif *blkif = arg;
        struct xen_vbd *vbd = &blkif->vbd;
        unsigned long timeout;
        int ret;

        xen_blkif_get(blkif);

        while (!kthread_should_stop()) {
                if (try_to_freeze())
                        continue;
                if (unlikely(vbd->size != vbd_sz(vbd)))
                        xen_vbd_resize(blkif);

                timeout = msecs_to_jiffies(LRU_INTERVAL);

                timeout = wait_event_interruptible_timeout(
                        blkif->wq,
                        blkif->waiting_reqs || kthread_should_stop(),
                        timeout);
                if (timeout == 0)
                        goto purge_gnt_list;
                timeout = wait_event_interruptible_timeout(
                        blkif->pending_free_wq,
                        !list_empty(&blkif->pending_free) ||
                        kthread_should_stop(),
                        timeout);
                if (timeout == 0)
                        goto purge_gnt_list;

                blkif->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */

                ret = do_block_io_op(blkif);
                if (ret > 0)
                        blkif->waiting_reqs = 1;
                if (ret == -EACCES)
                        wait_event_interruptible(blkif->shutdown_wq,
                                                 kthread_should_stop());

purge_gnt_list:
                if (blkif->vbd.feature_gnt_persistent &&
                    time_after(jiffies, blkif->next_lru)) {
                        purge_persistent_gnt(blkif);
                        blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
                }

                /* Shrink if we have more than xen_blkif_max_buffer_pages */
                shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);

                if (log_stats && time_after(jiffies, blkif->st_print))
                        print_stats(blkif);
        }

        /* Drain pending purge work */
        flush_work(&blkif->persistent_purge_work);

        if (log_stats)
                print_stats(blkif);

        blkif->xenblkd = NULL;
        xen_blkif_put(blkif);

        return 0;
}

/*
 * Remove persistent grants and empty the pool of free pages
 */
void xen_blkbk_free_caches(struct xen_blkif *blkif)
{
        /* Free all persistent grant pages */
        if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
                free_persistent_gnts(blkif, &blkif->persistent_gnts,
                        blkif->persistent_gnt_c);

        BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
        blkif->persistent_gnt_c = 0;

        /* Since we are shutting down remove all pages from the buffer */
        shrink_free_pagepool(blkif, 0 /* All */);
}
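
/*
 * Prepare the unmap of a batch of segments: persistent grants are simply
 * released back to the tree via put_persistent_gnt(), while regular grants
 * get a grant-table unmap op and have their page queued for return to the
 * free pool. Returns the number of unmap operations filled in.
 */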
static unsigned int xen_blkbk_unmap_prepare(
        struct xen_blkif *blkif,
        struct grant_page **pages,
        unsigned int num,
        struct gnttab_unmap_grant_ref *unmap_ops,
        struct page **unmap_pages)
{
        unsigned int i, invcount = 0;

        for (i = 0; i < num; i++) {
                if (pages[i]->persistent_gnt != NULL) {
                        put_persistent_gnt(blkif, pages[i]->persistent_gnt);
                        continue;
                }
                if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
                        continue;
                unmap_pages[invcount] = pages[i]->page;
                gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
                                    GNTMAP_host_map, pages[i]->handle);
                pages[i]->handle = BLKBACK_INVALID_HANDLE;
                invcount++;
        }

        return invcount;
}

static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
{
        struct pending_req *pending_req = (struct pending_req *)(data->data);
        struct xen_blkif *blkif = pending_req->blkif;

        /*
         * BUG_ON used to reproduce existing behaviour,
         * but is this the best way to deal with this?
         */
        BUG_ON(result);

        put_free_pages(blkif, data->pages, data->count);
        make_response(blkif, pending_req->id,
                      pending_req->operation, pending_req->status);
        free_req(blkif, pending_req);
        /*
         * Make sure the request is freed before releasing blkif,
         * or there could be a race between free_req and the
         * cleanup done in xen_blkif_free during shutdown.
         *
         * NB: The fact that we might try to wake up pending_free_wq
         * before drain_complete (in case there's a drain going on)
         * is not a problem with our current implementation
         * because we can be sure there's no thread waiting on
         * pending_free_wq if there's a drain going on, but it has
         * to be taken into account if the current model is changed.
         */
        if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain))
                complete(&blkif->drain_complete);
        xen_blkif_put(blkif);
}

static void xen_blkbk_unmap_and_respond(struct pending_req *req)
{
        struct gntab_unmap_queue_data *work = &req->gnttab_unmap_data;
        struct xen_blkif *blkif = req->blkif;
        struct grant_page **pages = req->segments;
        unsigned int invcount;

        invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_pages,
                                           req->unmap, req->unmap_pages);

        work->data = req;
        work->done = xen_blkbk_unmap_and_respond_callback;
        work->unmap_ops = req->unmap;
        work->kunmap_ops = NULL;
        work->pages = req->unmap_pages;
        work->count = invcount;

        gnttab_unmap_refs_async(&req->gnttab_unmap_data);
}
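
/*
 * Note that there are two unmap paths: xen_blkbk_unmap_and_respond() above
 * unmaps asynchronously and completes the request from the grant-table
 * callback, while xen_blkbk_unmap() below unmaps synchronously and is only
 * used on error paths where the caller sends the error response itself.
 */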

/*
 * Unmap the grant references.
 *
 * This could accumulate ops up to the batch size to reduce the number
 * of hypercalls, but since this is only used in error paths there's
 * no real need.
 */
static void xen_blkbk_unmap(struct xen_blkif *blkif,
                            struct grant_page *pages[],
                            int num)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int invcount = 0;
        int ret;

        while (num) {
                unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);

                invcount = xen_blkbk_unmap_prepare(blkif, pages, batch,
                                                   unmap, unmap_pages);
                if (invcount) {
                        ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
                        BUG_ON(ret);
                        put_free_pages(blkif, unmap_pages, invcount);
                }
                pages += batch;
                num -= batch;
        }
}

static int xen_blkbk_map(struct xen_blkif *blkif,
                         struct grant_page *pages[],
                         int num, bool ro)
{
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt = NULL;
        phys_addr_t addr = 0;
        int i, seg_idx, new_map_idx;
        int segs_to_map = 0;
        int ret = 0;
        int last_map = 0, map_until = 0;
        int use_persistent_gnts;

        use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);

        /*
         * Set up map[..] with the PFN of the page in our domain and the
         * corresponding grant reference for each page we need to map.
         */
again:
        for (i = map_until; i < num; i++) {
                uint32_t flags;

                if (use_persistent_gnts)
                        persistent_gnt = get_persistent_gnt(
                                blkif,
                                pages[i]->gref);

                if (persistent_gnt) {
                        /*
                         * We are using persistent grants and
                         * the grant is already mapped
                         */
                        pages[i]->page = persistent_gnt->page;
                        pages[i]->persistent_gnt = persistent_gnt;
                } else {
                        if (get_free_page(blkif, &pages[i]->page))
                                goto out_of_memory;
                        addr = vaddr(pages[i]->page);
                        pages_to_gnt[segs_to_map] = pages[i]->page;
                        pages[i]->persistent_gnt = NULL;
                        flags = GNTMAP_host_map;
                        if (!use_persistent_gnts && ro)
                                flags |= GNTMAP_readonly;
                        gnttab_set_map_op(&map[segs_to_map++], addr,
                                          flags, pages[i]->gref,
                                          blkif->domid);
                }
                map_until = i + 1;
                if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
                        break;
        }

        if (segs_to_map) {
                ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
                BUG_ON(ret);
        }

        /*
         * Now swizzle the MFN in our domain with the MFN from the other domain
         * so that when we access vaddr(pending_req,i) it has the contents of
         * the page from the other domain.
         */
        for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
                if (!pages[seg_idx]->persistent_gnt) {
                        /* This is a newly mapped grant */
                        BUG_ON(new_map_idx >= segs_to_map);
                        if (unlikely(map[new_map_idx].status != 0)) {
                                pr_debug("invalid buffer -- could not remap it\n");
                                put_free_pages(blkif, &pages[seg_idx]->page, 1);
                                pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
                                ret |= 1;
                                goto next;
                        }
                        pages[seg_idx]->handle = map[new_map_idx].handle;
                } else {
                        continue;
                }
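                /*
                 * The grant was freshly mapped above. If the frontend
                 * negotiated persistent grants and the tree is not full yet,
                 * try to cache the mapping below so future requests can reuse
                 * it without a map/unmap round trip; otherwise fall through
                 * and treat it as a one-off mapping.
                 */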
                if (use_persistent_gnts &&
                    blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
                        /*
                         * We are using persistent grants, the grant is
                         * not mapped but we might have room for it.
                         */
                        persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
                                                 GFP_KERNEL);
                        if (!persistent_gnt) {
                                /*
                                 * If we don't have enough memory to
                                 * allocate the persistent_gnt struct,
                                 * map this grant non-persistently
                                 */
                                goto next;
                        }
                        persistent_gnt->gnt = map[new_map_idx].ref;
                        persistent_gnt->handle = map[new_map_idx].handle;
                        persistent_gnt->page = pages[seg_idx]->page;
                        if (add_persistent_gnt(blkif,
                                               persistent_gnt)) {
                                kfree(persistent_gnt);
                                persistent_gnt = NULL;
                                goto next;
                        }
                        pages[seg_idx]->persistent_gnt = persistent_gnt;
                        pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
                                 persistent_gnt->gnt, blkif->persistent_gnt_c,
                                 xen_blkif_max_pgrants);
                        goto next;
                }
                if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
                        blkif->vbd.overflow_max_grants = 1;
                        pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
                                 blkif->domid, blkif->vbd.handle);
                }
                /*
                 * We could not map this grant persistently, so use it as
                 * a non-persistent grant.
                 */
next:
                new_map_idx++;
        }
        segs_to_map = 0;
        last_map = map_until;
        if (map_until != num)
                goto again;

        return ret;

out_of_memory:
        pr_alert("%s: out of memory\n", __func__);
        put_free_pages(blkif, pages_to_gnt, segs_to_map);
        return -ENOMEM;
}

static int xen_blkbk_map_seg(struct pending_req *pending_req)
{
        int rc;

        rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
                           pending_req->nr_pages,
                           (pending_req->operation != BLKIF_OP_READ));

        return rc;
}

static int xen_blkbk_parse_indirect(struct blkif_request *req,
                                    struct pending_req *pending_req,
                                    struct seg_buf seg[],
                                    struct phys_req *preq)
{
        struct grant_page **pages = pending_req->indirect_pages;
        struct xen_blkif *blkif = pending_req->blkif;
        int indirect_grefs, rc, n, nseg, i;
        struct blkif_request_segment *segments = NULL;

        nseg = pending_req->nr_pages;
        indirect_grefs = INDIRECT_PAGES(nseg);
        BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);

        for (i = 0; i < indirect_grefs; i++)
                pages[i]->gref = req->u.indirect.indirect_grefs[i];

        rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
        if (rc)
                goto unmap;

        for (n = 0, i = 0; n < nseg; n++) {
                if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
                        /* Map indirect segments */
                        if (segments)
                                kunmap_atomic(segments);
                        segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
                }
                i = n % SEGS_PER_INDIRECT_FRAME;
                pending_req->segments[n]->gref = segments[i].gref;
                seg[n].nsec = segments[i].last_sect -
                        segments[i].first_sect + 1;
                seg[n].offset = (segments[i].first_sect << 9);
                if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
                    (segments[i].last_sect < segments[i].first_sect)) {
                        rc = -EINVAL;
                        goto unmap;
                }
                preq->nr_sects += seg[n].nsec;
        }

unmap:
        if (segments)
                kunmap_atomic(segments);
        xen_blkbk_unmap(blkif, pages, indirect_grefs);
        return rc;
}
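
/*
 * Handle a BLKIF_OP_DISCARD request: validate the sector range against the
 * vbd (using WRITE permission, since a discard modifies the device) and
 * forward it to the block layer via blkdev_issue_discard(), as a secure
 * discard when the frontend asked for one and the device supports it.
 */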
static int dispatch_discard_io(struct xen_blkif *blkif,
                               struct blkif_request *req)
{
        int err = 0;
        int status = BLKIF_RSP_OKAY;
        struct block_device *bdev = blkif->vbd.bdev;
        unsigned long secure;
        struct phys_req preq;

        xen_blkif_get(blkif);

        preq.sector_number = req->u.discard.sector_number;
        preq.nr_sects      = req->u.discard.nr_sectors;

        err = xen_vbd_translate(&preq, blkif, WRITE);
        if (err) {
                pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
                        preq.sector_number,
                        preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
                goto fail_response;
        }
        blkif->st_ds_req++;

        secure = (blkif->vbd.discard_secure &&
                  (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
                 BLKDEV_DISCARD_SECURE : 0;

        err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
                                   req->u.discard.nr_sectors,
                                   GFP_KERNEL, secure);
fail_response:
        if (err == -EOPNOTSUPP) {
                pr_debug("discard op failed, not supported\n");
                status = BLKIF_RSP_EOPNOTSUPP;
        } else if (err)
                status = BLKIF_RSP_ERROR;

        make_response(blkif, req->u.discard.id, req->operation, status);
        xen_blkif_put(blkif);
        return err;
}

static int dispatch_other_io(struct xen_blkif *blkif,
                             struct blkif_request *req,
                             struct pending_req *pending_req)
{
        free_req(blkif, pending_req);
        make_response(blkif, req->u.other.id, req->operation,
                      BLKIF_RSP_EOPNOTSUPP);
        return -EIO;
}

static void xen_blk_drain_io(struct xen_blkif *blkif)
{
        atomic_set(&blkif->drain, 1);
        do {
                if (atomic_read(&blkif->inflight) == 0)
                        break;
                wait_for_completion_interruptible_timeout(
                                &blkif->drain_complete, HZ);

                if (!atomic_read(&blkif->drain))
                        break;
        } while (!kthread_should_stop());
        atomic_set(&blkif->drain, 0);
}

/*
 * Completion callback on the bios. Called from end_block_io_op(), the
 * bio->bi_end_io() handler.
 */
static void __end_block_io_op(struct pending_req *pending_req, int error)
{
        /* An error fails the entire request. */
        if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
            (error == -EOPNOTSUPP)) {
                pr_debug("flush diskcache op failed, not supported\n");
                xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
                   (error == -EOPNOTSUPP)) {
                pr_debug("write barrier op failed, not supported\n");
                xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if (error) {
                pr_debug("Buffer not up-to-date at end of operation,"
                         " error=%d\n", error);
                pending_req->status = BLKIF_RSP_ERROR;
        }

        /*
         * If all of the bio's have completed it is time to unmap
         * the grant references associated with 'request' and provide
         * the proper response on the ring.
         */
        if (atomic_dec_and_test(&pending_req->pendcnt))
                xen_blkbk_unmap_and_respond(pending_req);
}

/*
 * bio callback.
 */
static void end_block_io_op(struct bio *bio, int error)
{
        __end_block_io_op(bio->bi_private, error);
        bio_put(bio);
}

/*
 * Function to copy the 'struct blkif_request' from the ring buffer
 * (which has the sectors we want, number of them, grant references, etc),
 * and transmute it to the block API to hand it over to the proper block disk.
 */
static int
__do_block_io_op(struct xen_blkif *blkif)
{
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        struct blkif_request req;
        struct pending_req *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;

        rc = blk_rings->common.req_cons;
        rp = blk_rings->common.sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */

        if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
                rc = blk_rings->common.rsp_prod_pvt;
                pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
                        rp, rc, rp - rc, blkif->vbd.pdevice);
                return -EACCES;
        }
        while (rc != rp) {

                if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
                        break;

                if (kthread_should_stop()) {
                        more_to_do = 1;
                        break;
                }

                pending_req = alloc_req(blkif);
                if (NULL == pending_req) {
                        blkif->st_oo_req++;
                        more_to_do = 1;
                        break;
                }

                switch (blkif->blk_protocol) {
                case BLKIF_PROTOCOL_NATIVE:
                        memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
                        break;
                case BLKIF_PROTOCOL_X86_32:
                        blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
                        break;
                case BLKIF_PROTOCOL_X86_64:
                        blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
                        break;
                default:
                        BUG();
                }
                blk_rings->common.req_cons = ++rc; /* before make_response() */

                /* Apply all sanity checks to /private copy/ of request. */
                barrier();

                switch (req.operation) {
                case BLKIF_OP_READ:
                case BLKIF_OP_WRITE:
                case BLKIF_OP_WRITE_BARRIER:
                case BLKIF_OP_FLUSH_DISKCACHE:
                case BLKIF_OP_INDIRECT:
                        if (dispatch_rw_block_io(blkif, &req, pending_req))
                                goto done;
                        break;
                case BLKIF_OP_DISCARD:
                        free_req(blkif, pending_req);
                        if (dispatch_discard_io(blkif, &req))
                                goto done;
                        break;
                default:
                        if (dispatch_other_io(blkif, &req, pending_req))
                                goto done;
                        break;
                }

                /* Yield point for this unbounded loop. */
                cond_resched();
        }
done:
        return more_to_do;
}
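
/*
 * Wrapper around __do_block_io_op(): once the ring has been drained,
 * RING_FINAL_CHECK_FOR_REQUESTS() re-enables notifications and reports
 * whether new requests slipped in meanwhile, closing the race between
 * emptying the ring and the frontend queueing more work.
 */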
static int
do_block_io_op(struct xen_blkif *blkif)
{
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        int more_to_do;

        do {
                more_to_do = __do_block_io_op(blkif);
                if (more_to_do)
                        break;

                RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
        } while (more_to_do);

        return more_to_do;
}

/*
 * Transmute the 'struct blkif_request' into a proper 'struct bio'
 * and call 'submit_bio' to pass it to the underlying storage.
 */
static int dispatch_rw_block_io(struct xen_blkif *blkif,
                                struct blkif_request *req,
                                struct pending_req *pending_req)
{
        struct phys_req preq;
        struct seg_buf *seg = pending_req->seg;
        unsigned int nseg;
        struct bio *bio = NULL;
        struct bio **biolist = pending_req->biolist;
        int i, nbio = 0;
        int operation;
        struct blk_plug plug;
        bool drain = false;
        struct grant_page **pages = pending_req->segments;
        unsigned short req_operation;

        req_operation = req->operation == BLKIF_OP_INDIRECT ?
                        req->u.indirect.indirect_op : req->operation;
        if ((req->operation == BLKIF_OP_INDIRECT) &&
            (req_operation != BLKIF_OP_READ) &&
            (req_operation != BLKIF_OP_WRITE)) {
                pr_debug("Invalid indirect operation (%u)\n", req_operation);
                goto fail_response;
        }

        switch (req_operation) {
        case BLKIF_OP_READ:
                blkif->st_rd_req++;
                operation = READ;
                break;
        case BLKIF_OP_WRITE:
                blkif->st_wr_req++;
                operation = WRITE_ODIRECT;
                break;
        case BLKIF_OP_WRITE_BARRIER:
                drain = true;
                /* fall through */
        case BLKIF_OP_FLUSH_DISKCACHE:
                blkif->st_f_req++;
                operation = WRITE_FLUSH;
                break;
        default:
                operation = 0; /* make gcc happy */
                goto fail_response;
                break;
        }

        /* Check that the number of segments is sane. */
        nseg = req->operation == BLKIF_OP_INDIRECT ?
               req->u.indirect.nr_segments : req->u.rw.nr_segments;

        if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
            unlikely((req->operation != BLKIF_OP_INDIRECT) &&
                     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
            unlikely((req->operation == BLKIF_OP_INDIRECT) &&
                     (nseg > MAX_INDIRECT_SEGMENTS))) {
                pr_debug("Bad number of segments in request (%d)\n", nseg);
                /* Haven't submitted any bio's yet. */
                goto fail_response;
        }

        preq.nr_sects = 0;

        pending_req->blkif     = blkif;
        pending_req->id        = req->u.rw.id;
        pending_req->operation = req_operation;
        pending_req->status    = BLKIF_RSP_OKAY;
        pending_req->nr_pages  = nseg;

        if (req->operation != BLKIF_OP_INDIRECT) {
                preq.dev           = req->u.rw.handle;
                preq.sector_number = req->u.rw.sector_number;
                for (i = 0; i < nseg; i++) {
                        pages[i]->gref = req->u.rw.seg[i].gref;
                        seg[i].nsec = req->u.rw.seg[i].last_sect -
                                req->u.rw.seg[i].first_sect + 1;
                        seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
                        if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
                            (req->u.rw.seg[i].last_sect <
                             req->u.rw.seg[i].first_sect))
                                goto fail_response;
                        preq.nr_sects += seg[i].nsec;
                }
        } else {
                preq.dev           = req->u.indirect.handle;
                preq.sector_number = req->u.indirect.sector_number;
                if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
                        goto fail_response;
        }

        if (xen_vbd_translate(&preq, blkif, operation) != 0) {
                pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
                         operation == READ ? "read" : "write",
                         preq.sector_number,
                         preq.sector_number + preq.nr_sects,
                         blkif->vbd.pdevice);
                goto fail_response;
        }

        /*
         * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
         * is set there.
         */
        for (i = 0; i < nseg; i++) {
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
                        pr_debug("Misaligned I/O request from domain %d\n",
                                 blkif->domid);
                        goto fail_response;
                }
        }

        /*
         * Wait on all outstanding I/O's and once that has been completed
         * issue the WRITE_FLUSH.
         */
        if (drain)
                xen_blk_drain_io(pending_req->blkif);

        /*
         * If we have failed at this point, we need to undo the M2P override,
         * set gnttab_set_unmap_op on all of the grant references and perform
         * the hypercall to unmap the grants - that is all done in
         * xen_blkbk_unmap.
         */
        if (xen_blkbk_map_seg(pending_req))
                goto fail_flush;

        /*
         * The corresponding xen_blkif_put is done in __end_block_io_op, or
         * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
         */
        xen_blkif_get(blkif);
        atomic_inc(&blkif->inflight);

        for (i = 0; i < nseg; i++) {
                while ((bio == NULL) ||
                       (bio_add_page(bio,
                                     pages[i]->page,
                                     seg[i].nsec << 9,
                                     seg[i].offset) == 0)) {

                        int nr_iovecs = min_t(int, (nseg - i), BIO_MAX_PAGES);
                        bio = bio_alloc(GFP_KERNEL, nr_iovecs);
                        if (unlikely(bio == NULL))
                                goto fail_put_bio;

                        biolist[nbio++] = bio;
                        bio->bi_bdev    = preq.bdev;
                        bio->bi_private = pending_req;
                        bio->bi_end_io  = end_block_io_op;
                        bio->bi_iter.bi_sector = preq.sector_number;
                }

                preq.sector_number += seg[i].nsec;
        }

        /* This will be hit if the operation was a flush. */
        if (!bio) {
                BUG_ON(operation != WRITE_FLUSH);

                bio = bio_alloc(GFP_KERNEL, 0);
                if (unlikely(bio == NULL))
                        goto fail_put_bio;

                biolist[nbio++] = bio;
                bio->bi_bdev    = preq.bdev;
                bio->bi_private = pending_req;
                bio->bi_end_io  = end_block_io_op;
        }

        atomic_set(&pending_req->pendcnt, nbio);
        blk_start_plug(&plug);

        for (i = 0; i < nbio; i++)
                submit_bio(operation, biolist[i]);

        /* Let the I/Os go.. */
        blk_finish_plug(&plug);

        if (operation == READ)
                blkif->st_rd_sect += preq.nr_sects;
        else if (operation & WRITE)
                blkif->st_wr_sect += preq.nr_sects;

        return 0;

 fail_flush:
        xen_blkbk_unmap(blkif, pending_req->segments,
                        pending_req->nr_pages);
 fail_response:
        /* Haven't submitted any bio's yet. */
        make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
        free_req(blkif, pending_req);
        msleep(1); /* back off a bit */
        return -EIO;

 fail_put_bio:
        for (i = 0; i < nbio; i++)
                bio_put(biolist[i]);
        atomic_set(&pending_req->pendcnt, 1);
        __end_block_io_op(pending_req, -EINVAL);
        msleep(1); /* back off a bit */
        return -EIO;
}


/*
 * Put a response on the ring on how the operation fared.
 */
static void make_response(struct xen_blkif *blkif, u64 id,
                          unsigned short op, int st)
{
        struct blkif_response resp;
        unsigned long flags;
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        int notify;

        resp.id        = id;
        resp.operation = op;
        resp.status    = st;

        spin_lock_irqsave(&blkif->blk_ring_lock, flags);
        /* Place on the response ring for the relevant domain. */
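        /*
         * The response layout depends on the ABI negotiated with the
         * frontend: 32-bit and 64-bit guests lay out the shared ring
         * structures with different alignment/padding, hence the three
         * protocol variants below.
         */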
        switch (blkif->blk_protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_32:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_64:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        default:
                BUG();
        }
        blk_rings->common.rsp_prod_pvt++;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
        if (notify)
                notify_remote_via_irq(blkif->irq);
}

static int __init xen_blkif_init(void)
{
        int rc = 0;

        if (!xen_domain())
                return -ENODEV;

        rc = xen_blkif_interface_init();
        if (rc)
                goto failed_init;

        rc = xen_blkif_xenbus_init();
        if (rc)
                goto failed_init;

 failed_init:
        return rc;
}

module_init(xen_blkif_init);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("xen-backend:vbd");