/*
   drbd_worker.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

*/

#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>

#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"

static int make_ov_request(struct drbd_device *, int);
static int make_resync_request(struct drbd_device *, int);

/* endio handlers:
 *   drbd_md_endio (defined here)
 *   drbd_request_endio (defined here)
 *   drbd_peer_request_endio (defined here)
 *   drbd_bm_endio (defined in drbd_bitmap.c)
 *
 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 *
 */


/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the resync-after dependencies, we grab a write lock, because
   we need stable states on all devices for that.  */
rwlock_t global_state_lock;

/* used for synchronous meta data and bitmap IO
 * submitted by drbd_md_sync_page_io()
 */
void drbd_md_endio(struct bio *bio)
{
	struct drbd_device *device;

	device = bio->bi_private;
	device->md_io.error = bio->bi_error;

	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
	 * to timeout on the lower level device, and eventually detach from it.
	 * If this io completion runs after that timeout expired, this
	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
	 * During normal operation, this only puts that extra reference
	 * down to 1 again.
	 * Make sure we first drop the reference, and only then signal
	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
	 * next drbd_md_sync_page_io() that we trigger the
	 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
	 */
	drbd_md_put_buffer(device);
	device->md_io.done = 1;
	wake_up(&device->misc_wait);
	bio_put(bio);
	if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
		put_ldev(device);
}

/* reads on behalf of the partner,
 * "submitted" by the receiver
 */
static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;

	spin_lock_irqsave(&device->resource->req_lock, flags);
	device->read_cnt += peer_req->i.size >> 9;
	list_del(&peer_req->w.list);
	if (list_empty(&device->read_ee))
		wake_up(&device->ee_wait);
	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
		__drbd_chk_io_error(device, DRBD_READ_ERROR);
	spin_unlock_irqrestore(&device->resource->req_lock, flags);

	drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
	put_ldev(device);
}

/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver, final stage.  */
void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	struct drbd_interval i;
	int do_wake;
	u64 block_id;
	int do_al_complete_io;

	/* after we moved peer_req to done_ee,
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
	i = peer_req->i;
	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
	block_id = peer_req->block_id;
	peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;

	spin_lock_irqsave(&device->resource->req_lock, flags);
	device->writ_cnt += peer_req->i.size >> 9;
	list_move_tail(&peer_req->w.list, &device->done_ee);

	/*
	 * Do not remove from the write_requests tree here: we did not send the
	 * Ack yet and did not wake possibly waiting conflicting requests.
	 * Removal from the tree happens in "drbd_process_done_ee" within the
	 * appropriate dw.cb (e_end_block/e_end_resync_block) or from
	 * _drbd_clear_done_ee.
	 */

	do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);

	/* FIXME do we want to detach for failed REQ_DISCARD?
	 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
	if (peer_req->flags & EE_WAS_ERROR)
		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
	spin_unlock_irqrestore(&device->resource->req_lock, flags);

	if (block_id == ID_SYNCER)
		drbd_rs_complete_io(device, i.sector);

	if (do_wake)
		wake_up(&device->ee_wait);

	if (do_al_complete_io)
		drbd_al_complete_io(device, &i);

	wake_asender(peer_device->connection);
	put_ldev(device);
}

/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 */
void drbd_peer_request_endio(struct bio *bio)
{
	struct drbd_peer_request *peer_req = bio->bi_private;
	struct drbd_device *device = peer_req->peer_device->device;
	int is_write = bio_data_dir(bio) == WRITE;
	int is_discard = !!(bio->bi_rw & REQ_DISCARD);

	if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
		drbd_warn(device, "%s: error=%d s=%llus\n",
				is_write ? (is_discard ? "discard" : "write")
					: "read", bio->bi_error,
				(unsigned long long)peer_req->i.sector);

	if (bio->bi_error)
		set_bit(__EE_WAS_ERROR, &peer_req->flags);

	bio_put(bio); /* no need for the bio anymore */
	if (atomic_dec_and_test(&peer_req->pending_bios)) {
		if (is_write)
			drbd_endio_write_sec_final(peer_req);
		else
			drbd_endio_read_sec_final(peer_req);
	}
}

/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 */
void drbd_request_endio(struct bio *bio)
{
	unsigned long flags;
	struct drbd_request *req = bio->bi_private;
	struct drbd_device *device = req->device;
	struct bio_and_error m;
	enum drbd_req_event what;

	/* If this request was aborted locally before,
	 * but now was completed "successfully",
	 * chances are that this caused arbitrary data corruption.
	 *
	 * "aborting" requests, or force-detaching the disk, is intended for
	 * completely blocked/hung local backing devices which no longer
	 * complete requests at all, not even error completions.  In this
	 * situation, usually a hard-reset and failover is the only way out.
	 *
	 * By "aborting", basically faking a local error-completion,
	 * we allow for a more graceful switchover by cleanly migrating services.
	 * Still the affected node has to be rebooted "soon".
	 *
	 * By completing these requests, we allow the upper layers to re-use
	 * the associated data pages.
	 *
	 * If later the local backing device "recovers", and now DMAs some data
	 * from disk into the original request pages, in the best case it will
	 * just put random data into unused pages; but typically it will corrupt
	 * meanwhile completely unrelated data, causing all sorts of damage.
	 *
	 * Which means delayed successful completion,
	 * especially for READ requests,
	 * is a reason to panic().
	 *
	 * We assume that a delayed *error* completion is OK,
	 * though we still will complain noisily about it.
	 */
	if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");

		if (!bio->bi_error)
			panic("possible random memory corruption caused by delayed completion of aborted local request\n");
	}

	/* to avoid recursion in __req_mod */
	if (unlikely(bio->bi_error)) {
		if (bio->bi_rw & REQ_DISCARD)
			what = (bio->bi_error == -EOPNOTSUPP)
				? DISCARD_COMPLETED_NOTSUPP
				: DISCARD_COMPLETED_WITH_ERROR;
		else
			what = (bio_data_dir(bio) == WRITE)
			? WRITE_COMPLETED_WITH_ERROR
			: (bio_rw(bio) == READ)
			  ? READ_COMPLETED_WITH_ERROR
			  : READ_AHEAD_COMPLETED_WITH_ERROR;
	} else
		what = COMPLETED_OK;

	bio_put(req->private_bio);
	req->private_bio = ERR_PTR(bio->bi_error);

	/* not req_mod(), we need irqsave here! */
	spin_lock_irqsave(&device->resource->req_lock, flags);
	__req_mod(req, what, &m);
	spin_unlock_irqrestore(&device->resource->req_lock, flags);
	put_ldev(device);

	if (m.bio)
		complete_master_bio(device, &m);
}

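/* Hash all pages of a peer request into @digest.
 * Every page in the page chain is fully used, except possibly the last
 * one, which covers only the remainder of peer_req->i.size. */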
void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct page *page = peer_req->pages;
	struct page *tmp;
	unsigned len;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	while ((tmp = page_chain_next(page))) {
		/* all but the last page will be fully used */
		sg_set_page(&sg, page, PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		page = tmp;
	}
	/* and now the last, possibly only partially used page */
	len = peer_req->i.size & (PAGE_SIZE - 1);
	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
	crypto_hash_update(&desc, &sg, sg.length);
	crypto_hash_final(&desc, digest);
}

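/* Hash all data segments of @bio into @digest,
 * feeding the hash one bio_vec at a time. */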
void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct bio_vec bvec;
	struct bvec_iter iter;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	bio_for_each_segment(bvec, bio, iter) {
		sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
		crypto_hash_update(&desc, &sg, sg.length);
	}
	crypto_hash_final(&desc, digest);
}

/* MAYBE merge common code with w_e_end_ov_req */
static int w_e_send_csum(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	int digest_size;
	void *digest;
	int err = 0;

	if (unlikely(cancel))
		goto out;

	if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
		goto out;

	digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
	digest = kmalloc(digest_size, GFP_NOIO);
	if (digest) {
		sector_t sector = peer_req->i.sector;
		unsigned int size = peer_req->i.size;
		drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
		/* Free peer_req and pages before send.
		 * In case we block on congestion, we could otherwise run into
		 * some distributed deadlock, if the other side blocks on
		 * congestion as well, because our receiver blocks in
		 * drbd_alloc_pages due to pp_in_use > max_buffers. */
		drbd_free_peer_req(device, peer_req);
		peer_req = NULL;
		inc_rs_pending(device);
		err = drbd_send_drequest_csum(peer_device, sector, size,
					      digest, digest_size,
					      P_CSUM_RS_REQUEST);
		kfree(digest);
	} else {
		drbd_err(device, "kmalloc() of digest failed.\n");
		err = -ENOMEM;
	}

out:
	if (peer_req)
		drbd_free_peer_req(device, peer_req);

	if (unlikely(err))
		drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
	return err;
}

#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
{
	struct drbd_device *device = peer_device->device;
	struct drbd_peer_request *peer_req;

	if (!get_ldev(device))
		return -EIO;

	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
				       size, true /* has real payload */, GFP_TRY);
	if (!peer_req)
		goto defer;

	peer_req->w.cb = w_e_send_csum;
	spin_lock_irq(&device->resource->req_lock);
	list_add_tail(&peer_req->w.list, &device->read_ee);
	spin_unlock_irq(&device->resource->req_lock);

	atomic_add(size >> 9, &device->rs_sect_ev);
	if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
		return 0;

	/* If it failed because of ENOMEM, retry should help.  If it failed
	 * because bio_add_page failed (probably broken lower level driver),
	 * retry may or may not help.
	 * If it does not, you may need to force disconnect. */
	spin_lock_irq(&device->resource->req_lock);
	list_del(&peer_req->w.list);
	spin_unlock_irq(&device->resource->req_lock);

	drbd_free_peer_req(device, peer_req);
defer:
	put_ldev(device);
	return -EAGAIN;
}

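/* Resync/online-verify work item, (re-)queued via resync_timer_fn():
 * depending on the current connection state, generate either
 * online-verify or resync requests. */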
int w_resync_timer(struct drbd_work *w, int cancel)
{
	struct drbd_device *device =
		container_of(w, struct drbd_device, resync_work);

	switch (device->state.conn) {
	case C_VERIFY_S:
		make_ov_request(device, cancel);
		break;
	case C_SYNC_TARGET:
		make_resync_request(device, cancel);
		break;
	}

	return 0;
}

void resync_timer_fn(unsigned long data)
{
	struct drbd_device *device = (struct drbd_device *) data;

	drbd_queue_work_if_unqueued(
		&first_peer_device(device)->connection->sender_work,
		&device->resync_work);
}

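/* The fifo_buffer below is the request plan of the resync controller:
 * a ring of per-step corrections (in sectors) plus their sum in ->total.
 * fifo_push() returns the oldest entry while storing a new one;
 * fifo_add_val() adjusts all planned steps at once. */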
static void fifo_set(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] = value;
}

static int fifo_push(struct fifo_buffer *fb, int value)
{
	int ov;

	ov = fb->values[fb->head_index];
	fb->values[fb->head_index++] = value;

	if (fb->head_index >= fb->size)
		fb->head_index = 0;

	return ov;
}

static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] += value;
}

struct fifo_buffer *fifo_alloc(int fifo_size)
{
	struct fifo_buffer *fb;

	fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
	if (!fb)
		return NULL;

	fb->head_index = 0;
	fb->size = fifo_size;
	fb->total = 0;

	return fb;
}

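/* Feedback controller for the resync rate ("dynamic resync speed"):
 * every SLEEP_TIME tick, compare how many sectors we want in flight
 * ("want") with what is actually in flight plus what is already
 * planned, and spread the difference evenly over the plan's steps.
 *
 * Illustrative numbers (not from any particular configuration):
 * with c_fill_target = 1000 sectors, rs_in_flight = 600 and
 * plan->total = 200, the correction is 1000 - 600 - 200 = 200 sectors;
 * with plan->size = 10 steps, each step is adjusted by cps = 20. */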
static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
{
	struct disk_conf *dc;
	unsigned int want;     /* The number of sectors we want in-flight */
	int req_sect; /* Number of sectors to request in this turn */
	int correction; /* Number of sectors more we need in-flight */
	int cps; /* correction per invocation of drbd_rs_controller() */
	int steps; /* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;
	struct fifo_buffer *plan;

	dc = rcu_dereference(device->ldev->disk_conf);
	plan = rcu_dereference(device->rs_plan_s);

	steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */

	if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
		want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
	} else { /* normal path */
		want = dc->c_fill_target ? dc->c_fill_target :
			sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
	}

	correction = want - device->rs_in_flight - plan->total;

	/* Plan ahead */
	cps = correction / steps;
	fifo_add_val(plan, cps);
	plan->total += cps * steps;

	/* What we do in this step */
	curr_corr = fifo_push(plan, 0);
	plan->total -= curr_corr;

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;

	max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, device->rs_in_flight, want, correction,
		 steps, cps, device->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}

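/* Decide how many resync requests (in units of BM_BLOCK_SIZE, 4k) to
 * issue in this turn, either from the fixed resync_rate or from the
 * controller above, capped at max-buffers/2 in flight.  For example,
 * assuming SLEEP_TIME = HZ/10, a c_sync_rate of 4000 KiB/s yields
 * 4000/10 / 4 = 100 requests per turn, before the in-flight cap. */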
static int drbd_rs_number_requests(struct drbd_device *device)
{
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	int number, mxb;

	sect_in = atomic_xchg(&device->rs_sect_in, 0);
	device->rs_in_flight -= sect_in;

	rcu_read_lock();
	mxb = drbd_get_max_buffers(device) / 2;
	if (rcu_dereference(device->rs_plan_s)->size) {
		number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
		device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
		device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
		number = SLEEP_TIME * device->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
	}
	rcu_read_unlock();

	/* Don't have more than "max-buffers"/2 in-flight.
	 * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
	 * potentially causing a distributed deadlock on congestion during
	 * online-verify or (checksum-based) resync, if max-buffers,
	 * socket buffer sizes and resync rate settings are mis-configured. */

	/* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
	 * mxb (as used here, and in drbd_alloc_pages on the peer) is
	 * "number of pages" (typically also 4k),
	 * but "rs_in_flight" is in "sectors" (512 Byte). */
	if (mxb - device->rs_in_flight/8 < number)
		number = mxb - device->rs_in_flight/8;

	return number;
}

static int make_resync_request(struct drbd_device *const device, int cancel)
{
	struct drbd_peer_device *const peer_device = first_peer_device(device);
	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(device->this_bdev);
	int max_bio_size;
	int number, rollback_i, size;
	int align, requeue = 0;
	int i = 0;

	if (unlikely(cancel))
		return 0;

	if (device->rs_total == 0) {
		/* empty resync? */
		drbd_resync_finished(device);
		return 0;
	}

	if (!get_ldev(device)) {
		/* Since we only need to access device->resync, a
		   get_ldev_if_state(device, D_FAILED) would be sufficient, but
		   continuing resync with a broken disk makes no sense at
		   all */
		drbd_err(device, "Disk broke down during resync!\n");
		return 0;
	}

	max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
	number = drbd_rs_number_requests(device);
	if (number <= 0)
		goto requeue;

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests when half of the send buffer is filled,
		 * but notify TCP that we'd like to have more space. */
		mutex_lock(&connection->data.mutex);
		if (connection->data.socket) {
			struct sock *sk = connection->data.socket->sk;
			int queued = sk->sk_wmem_queued;
			int sndbuf = sk->sk_sndbuf;
			if (queued > sndbuf / 2) {
				requeue = 1;
				if (sk->sk_socket)
					set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
		} else
			requeue = 1;
		mutex_unlock(&connection->data.mutex);
		if (requeue)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(device, device->bm_resync_fo);

		if (bit == DRBD_END_OF_BITMAP) {
			device->bm_resync_fo = drbd_bm_bits(device);
			put_ldev(device);
			return 0;
		}

		sector = BM_BIT_TO_SECT(bit);

		if (drbd_try_rs_begin_io(device, sector)) {
			device->bm_resync_fo = bit;
			goto requeue;
		}
		device->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
			drbd_rs_complete_io(device, sector);
			goto next_sector;
		}

#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
		rollback_i = i;
		while (i < number) {
			if (size + BM_BLOCK_SIZE > max_bio_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(device, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			device->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		if (device->use_csums) {
			switch (read_for_csum(peer_device, sector, size)) {
			case -EIO: /* Disk failure */
				put_ldev(device);
				return -EIO;
			case -EAGAIN: /* allocation failed, or ldev busy */
				drbd_rs_complete_io(device, sector);
				device->bm_resync_fo = BM_SECT_TO_BIT(sector);
				i = rollback_i;
				goto requeue;
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
			}
		} else {
			int err;

			inc_rs_pending(device);
			err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
						 sector, size, ID_SYNCER);
			if (err) {
				drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(device);
				put_ldev(device);
				return err;
			}
		}
	}

	if (device->bm_resync_fo >= drbd_bm_bits(device)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		put_ldev(device);
		return 0;
	}

 requeue:
	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(device);
	return 0;
}

static int make_ov_request(struct drbd_device *device, int cancel)
{
	int number, i, size;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(device->this_bdev);
	bool stop_sector_reached = false;

	if (unlikely(cancel))
		return 1;

	number = drbd_rs_number_requests(device);

	sector = device->ov_position;
	for (i = 0; i < number; i++) {
		if (sector >= capacity)
			return 1;

		/* We check for "finished" only in the reply path:
		 * w_e_end_ov_reply().
		 * We need to send at least one request out. */
		stop_sector_reached = i > 0
			&& verify_can_do_stop_sector(device)
			&& sector >= device->ov_stop_sector;
		if (stop_sector_reached)
			break;

		size = BM_BLOCK_SIZE;

		if (drbd_try_rs_begin_io(device, sector)) {
			device->ov_position = sector;
			goto requeue;
		}

		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		inc_rs_pending(device);
		if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
			dec_rs_pending(device);
			return 0;
		}
		sector += BM_SECT_PER_BIT;
	}
	device->ov_position = sector;

 requeue:
	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	if (i == 0 || !stop_sector_reached)
		mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
	return 1;
}

int w_ov_finished(struct drbd_work *w, int cancel)
{
	struct drbd_device_work *dw =
		container_of(w, struct drbd_device_work, w);
	struct drbd_device *device = dw->device;
	kfree(dw);
	ov_out_of_sync_print(device);
	drbd_resync_finished(device);

	return 0;
}

static int w_resync_finished(struct drbd_work *w, int cancel)
{
	struct drbd_device_work *dw =
		container_of(w, struct drbd_device_work, w);
	struct drbd_device *device = dw->device;
	kfree(dw);

	drbd_resync_finished(device);

	return 0;
}

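/* Make sure the peer is reachable (or learn that it is not) before
 * concluding a resync: send a ping and wait for the ack, unless the
 * connection is lost first. */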
static void ping_peer(struct drbd_device *device)
{
	struct drbd_connection *connection = first_peer_device(device)->connection;

	clear_bit(GOT_PING_ACK, &connection->flags);
	request_ping(connection);
	wait_event(connection->ping_wait,
		   test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
}

int drbd_resync_finished(struct drbd_device *device)
{
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_device_work *dw;
	char *khelper_cmd = NULL;
	int verify_done = 0;

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, the entries in the
	 * resync LRU would otherwise be wrong. */
	if (drbd_rs_del_all(device)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * are not finished by now), retry in 100ms. */

		schedule_timeout_interruptible(HZ / 10);
		dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
		if (dw) {
			dw->w.cb = w_resync_finished;
			dw->device = device;
			drbd_queue_work(&first_peer_device(device)->connection->sender_work,
					&dw->w);
			return 1;
		}
		drbd_err(device, "Warn: both drbd_rs_del_all() and kmalloc(dw) failed.\n");
	}

	dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
	if (dt <= 0)
		dt = 1;

	db = device->rs_total;
	/* adjust for verify start and stop sectors, respectively the reached position */
	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
		db -= device->ov_left;

	dbdt = Bit2KB(db/dt);
	device->rs_paused /= HZ;

	if (!get_ldev(device))
		goto out;

	ping_peer(device);

	spin_lock_irq(&device->resource->req_lock);
	os = drbd_read_state(device);

	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);

	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
	     verify_done ? "Online verify" : "Resync",
	     dt + device->rs_paused, device->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(device);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			drbd_alert(device, "Online verify found %lu %dk blocks out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT(device, (n_oos - device->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

		if (device->use_csums && device->rs_total) {
			const unsigned long s = device->rs_same_csum;
			const unsigned long t = device->rs_total;
			const int ratio =
				(t == 0)     ? 0 :
				(t < 100000) ? ((s*100)/t) : (s/(t/100));
			drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(device->rs_same_csum),
			     Bit2KB(device->rs_total - device->rs_same_csum),
			     Bit2KB(device->rs_total));
		}
	}

	if (device->rs_failed) {
		drbd_info(device, "            %lu failed blocks\n", device->rs_failed);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (device->p_uuid) {
				int i;
				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
					_drbd_uuid_set(device, i, device->p_uuid[i]);
				drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
			} else {
				drbd_err(device, "device->p_uuid is NULL! BUG\n");
			}
		}

		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
			/* for verify runs, we don't update uuids here,
			 * so there would be nothing to report. */
			drbd_uuid_set_bm(device, 0UL);
			drbd_print_uuids(device, "updated UUIDs");
			if (device->p_uuid) {
				/* Now the two UUID sets are equal, update what we
				 * know of the peer. */
				int i;
				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
					device->p_uuid[i] = device->ldev->md.uuid[i];
			}
		}
	}

	_drbd_set_state(device, ns, CS_VERBOSE, NULL);
out_unlock:
	spin_unlock_irq(&device->resource->req_lock);
	put_ldev(device);
out:
	device->rs_total  = 0;
	device->rs_failed = 0;
	device->rs_paused = 0;

	/* reset start sector, if we reached end of device */
	if (verify_done && device->ov_left == 0)
		device->ov_start_sector = 0;

	drbd_md_sync(device);

	if (khelper_cmd)
		drbd_khelper(device, khelper_cmd);

	return 1;
}

/* helper */
static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
	if (drbd_peer_req_has_active_page(peer_req)) {
		/* This might happen if sendpage() has not finished */
		int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
		atomic_add(i, &device->pp_in_use_by_net);
		atomic_sub(i, &device->pp_in_use);
		spin_lock_irq(&device->resource->req_lock);
		list_add_tail(&peer_req->w.list, &device->net_ee);
		spin_unlock_irq(&device->resource->req_lock);
		wake_up(&drbd_pp_wait);
	} else
		drbd_free_peer_req(device, peer_req);
}

/**
 * w_e_end_data_req() - Worker callback to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_data_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	int err;

	if (unlikely(cancel)) {
		drbd_free_peer_req(device, peer_req);
		dec_unacked(device);
		return 0;
	}

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_err(device, "Sending NegDReply. sector=%llus.\n",
			    (unsigned long long)peer_req->i.sector);

		err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
	}

	dec_unacked(device);

	move_to_net_ee_or_free(device, peer_req);

	if (unlikely(err))
		drbd_err(device, "drbd_send_block() failed\n");
	return err;
}

/**
 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	int err;

	if (unlikely(cancel)) {
		drbd_free_peer_req(device, peer_req);
		dec_unacked(device);
		return 0;
	}

	if (get_ldev_if_state(device, D_FAILED)) {
		drbd_rs_complete_io(device, peer_req->i.sector);
		put_ldev(device);
	}

	if (device->state.conn == C_AHEAD) {
		err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		if (likely(device->state.pdsk >= D_INCONSISTENT)) {
			inc_rs_pending(device);
			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
		} else {
			if (__ratelimit(&drbd_ratelimit_state))
				drbd_err(device, "Not sending RSDataReply, "
				    "partner DISKLESS!\n");
			err = 0;
		}
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
			    (unsigned long long)peer_req->i.sector);

		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);

		/* update resync data with failure */
		drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
	}

	dec_unacked(device);

	move_to_net_ee_or_free(device, peer_req);

	if (unlikely(err))
		drbd_err(device, "drbd_send_block() failed\n");
	return err;
}

int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	struct digest_info *di;
	int digest_size;
	void *digest = NULL;
	int err, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_peer_req(device, peer_req);
		dec_unacked(device);
		return 0;
	}

	if (get_ldev(device)) {
		drbd_rs_complete_io(device, peer_req->i.sector);
		put_ldev(device);
	}

	di = peer_req->digest;

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		/* quick hack to try to avoid a race against reconfiguration.
		 * a real fix would be much more involved,
		 * introducing more locking mechanisms */
		if (peer_device->connection->csums_tfm) {
			digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
			D_ASSERT(device, digest_size == di->digest_size);
			digest = kmalloc(digest_size, GFP_NOIO);
		}
		if (digest) {
			drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}

		if (eq) {
			drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
			/* rs_same_csums unit is BM_BLOCK_SIZE */
			device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
			err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
		} else {
			inc_rs_pending(device);
			peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
			peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
			kfree(di);
			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
		}
	} else {
		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(device);
	move_to_net_ee_or_free(device, peer_req);

	if (unlikely(err))
		drbd_err(device, "drbd_send_block/ack() failed\n");
	return err;
}

int w_e_end_ov_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
	int digest_size;
	void *digest;
	int err = 0;

	if (unlikely(cancel))
		goto out;

	digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
	digest = kmalloc(digest_size, GFP_NOIO);
	if (!digest) {
		err = 1;	/* terminate the connection in case the allocation failed */
		goto out;
	}

	if (likely(!(peer_req->flags & EE_WAS_ERROR)))
		drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
	else
		memset(digest, 0, digest_size);

	/* Free e and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
	drbd_free_peer_req(device, peer_req);
	peer_req = NULL;
	inc_rs_pending(device);
	err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
	if (err)
		dec_rs_pending(device);
	kfree(digest);

out:
	if (peer_req)
		drbd_free_peer_req(device, peer_req);
	dec_unacked(device);
	return err;
}

void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
{
	if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
		device->ov_last_oos_size += size>>9;
	} else {
		device->ov_last_oos_start = sector;
		device->ov_last_oos_size = size>>9;
	}
	drbd_set_out_of_sync(device, sector, size);
}

int w_e_end_ov_reply(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	struct digest_info *di;
	void *digest;
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
	int digest_size;
	int err, eq = 0;
	bool stop_sector_reached = false;

	if (unlikely(cancel)) {
		drbd_free_peer_req(device, peer_req);
		dec_unacked(device);
		return 0;
	}

	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
	 * the resync lru has been cleaned up already */
	if (get_ldev(device)) {
		drbd_rs_complete_io(device, peer_req->i.sector);
		put_ldev(device);
	}

	di = peer_req->digest;

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);

			D_ASSERT(device, digest_size == di->digest_size);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}
	}

	/* Free peer_req and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
	drbd_free_peer_req(device, peer_req);
	if (!eq)
		drbd_ov_out_of_sync_found(device, sector, size);
	else
		ov_out_of_sync_print(device);

	err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
			       eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);

	dec_unacked(device);

	--device->ov_left;

	/* let's advance progress step marks only for every other megabyte */
	if ((device->ov_left & 0x200) == 0x200)
		drbd_advance_rs_marks(device, device->ov_left);

	stop_sector_reached = verify_can_do_stop_sector(device) &&
		(sector + (size>>9)) >= device->ov_stop_sector;

	if (device->ov_left == 0 || stop_sector_reached) {
		ov_out_of_sync_print(device);
		drbd_resync_finished(device);
	}

	return err;
}

/* FIXME
 * We need to track the number of pending barrier acks,
 * and to be able to wait for them.
 * See also comment in drbd_adm_attach before drbd_suspend_io.
 */
static int drbd_send_barrier(struct drbd_connection *connection)
{
	struct p_barrier *p;
	struct drbd_socket *sock;

	sock = &connection->data;
	p = conn_prepare_command(connection, sock);
	if (!p)
		return -EIO;
	p->barrier = connection->send.current_epoch_nr;
	p->pad = 0;
	connection->send.current_epoch_writes = 0;

	return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
}

int w_send_write_hint(struct drbd_work *w, int cancel)
{
	struct drbd_device *device =
		container_of(w, struct drbd_device, unplug_work);
	struct drbd_socket *sock;

	if (cancel)
		return 0;
	sock = &first_peer_device(device)->connection->data;
	if (!drbd_prepare_command(first_peer_device(device), sock))
		return -EIO;
	return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0);
}

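/* Epoch bookkeeping for the sender: remember the epoch of the first
 * write ever sent on this connection, so that maybe_send_barrier()
 * below can close an epoch with P_BARRIER once writes for a newer
 * epoch show up. */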
static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
{
	if (!connection->send.seen_any_write_yet) {
		connection->send.seen_any_write_yet = true;
		connection->send.current_epoch_nr = epoch;
		connection->send.current_epoch_writes = 0;
	}
}

static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
{
	/* nothing to do if no write has been sent on this connection yet */
	if (!connection->send.seen_any_write_yet)
		return;
	if (connection->send.current_epoch_nr != epoch) {
		if (connection->send.current_epoch_writes)
			drbd_send_barrier(connection);
		connection->send.current_epoch_nr = epoch;
	}
}

int w_send_out_of_sync(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_device *device = req->device;
	struct drbd_peer_device *const peer_device = first_peer_device(device);
	struct drbd_connection *const connection = peer_device->connection;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}
	req->pre_send_jif = jiffies;

	/* this time, no connection->send.current_epoch_writes++;
	 * If it was sent, it was the closing barrier for the last
	 * replicated epoch, before we went into AHEAD mode.
	 * No more barriers will be sent, until we leave AHEAD mode again. */
	maybe_send_barrier(connection, req->epoch);

	err = drbd_send_out_of_sync(peer_device, req);
	req_mod(req, OOS_HANDED_TO_NETWORK);

	return err;
}

/**
 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_send_dblock(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_device *device = req->device;
	struct drbd_peer_device *const peer_device = first_peer_device(device);
	struct drbd_connection *connection = peer_device->connection;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}
	req->pre_send_jif = jiffies;

	re_init_if_first_write(connection, req->epoch);
	maybe_send_barrier(connection, req->epoch);
	connection->send.current_epoch_writes++;

	err = drbd_send_dblock(peer_device, req);
	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);

	return err;
}

/**
 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_send_read_req(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_device *device = req->device;
	struct drbd_peer_device *const peer_device = first_peer_device(device);
	struct drbd_connection *connection = peer_device->connection;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}
	req->pre_send_jif = jiffies;

	/* Even read requests may close a write epoch,
	 * if there was any yet. */
	maybe_send_barrier(connection, req->epoch);

	err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
				 (unsigned long)req);

	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);

	return err;
}

int w_restart_disk_io(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_device *device = req->device;

	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
		drbd_al_begin_io(device, &req->i);

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = device->ldev->backing_bdev;
	generic_make_request(req->private_bio);

	return 0;
}

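/* Walk the resync-after dependency chain: returns 1 if this device may
 * resync now, 0 if a device it depends on is itself syncing or has its
 * sync paused.  drbd_resync_after_valid() guarantees that the chain is
 * cycle free. */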
static int _drbd_may_sync_now(struct drbd_device *device)
{
	struct drbd_device *odev = device;
	int resync_after;

	while (1) {
		if (!odev->ldev || odev->state.disk == D_DISKLESS)
			return 1;
		rcu_read_lock();
		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
		rcu_read_unlock();
		if (resync_after == -1)
			return 1;
		odev = minor_to_device(resync_after);
		if (!odev)
			return 1;
		if ((odev->state.conn >= C_SYNC_SOURCE &&
		     odev->state.conn <= C_PAUSED_SYNC_T) ||
		    odev->state.aftr_isp || odev->state.peer_isp ||
		    odev->state.user_isp)
			return 0;
	}
}

/**
 * _drbd_pause_after() - Pause resync on all devices that may not resync now
 * @device:	DRBD device.
 *
 * Called from process context only (admin command and after_state_ch).
 */
static int _drbd_pause_after(struct drbd_device *device)
{
	struct drbd_device *odev;
	int i, rv = 0;

	rcu_read_lock();
	idr_for_each_entry(&drbd_devices, odev, i) {
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (!_drbd_may_sync_now(odev))
			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
			       != SS_NOTHING_TO_DO);
	}
	rcu_read_unlock();

	return rv;
}

/**
 * _drbd_resume_next() - Resume resync on all devices that may resync now
 * @device:	DRBD device.
 *
 * Called from process context only (admin command and worker).
 */
static int _drbd_resume_next(struct drbd_device *device)
{
	struct drbd_device *odev;
	int i, rv = 0;

	rcu_read_lock();
	idr_for_each_entry(&drbd_devices, odev, i) {
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (odev->state.aftr_isp) {
			if (_drbd_may_sync_now(odev))
				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
							CS_HARD, NULL)
				       != SS_NOTHING_TO_DO);
		}
	}
	rcu_read_unlock();
	return rv;
}

void resume_next_sg(struct drbd_device *device)
{
	write_lock_irq(&global_state_lock);
	_drbd_resume_next(device);
	write_unlock_irq(&global_state_lock);
}

void suspend_other_sg(struct drbd_device *device)
{
	write_lock_irq(&global_state_lock);
	_drbd_pause_after(device);
	write_unlock_irq(&global_state_lock);
}

/* caller must hold global_state_lock */
enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
{
	struct drbd_device *odev;
	int resync_after;

	if (o_minor == -1)
		return NO_ERROR;
	if (o_minor < -1 || o_minor > MINORMASK)
		return ERR_RESYNC_AFTER;

	/* check for loops */
	odev = minor_to_device(o_minor);
	while (1) {
		if (odev == device)
			return ERR_RESYNC_AFTER_CYCLE;

		/* You are free to depend on diskless, non-existing,
		 * or not yet/no longer existing minors.
		 * We only reject dependency loops.
		 * We cannot follow the dependency chain beyond a detached or
		 * missing minor.
		 */
		if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
			return NO_ERROR;

		rcu_read_lock();
		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
		rcu_read_unlock();
		/* dependency chain ends here, no cycles. */
		if (resync_after == -1)
			return NO_ERROR;

		/* follow the dependency chain */
		odev = minor_to_device(resync_after);
	}
}

/* caller must hold global_state_lock */
void drbd_resync_after_changed(struct drbd_device *device)
{
	int changes;

	do {
		changes  = _drbd_pause_after(device);
		changes |= _drbd_resume_next(device);
	} while (changes);
}

void drbd_rs_controller_reset(struct drbd_device *device)
{
	struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
	struct fifo_buffer *plan;

	atomic_set(&device->rs_sect_in, 0);
	atomic_set(&device->rs_sect_ev, 0);
	device->rs_in_flight = 0;
	device->rs_last_events =
		(int)part_stat_read(&disk->part0, sectors[0]) +
		(int)part_stat_read(&disk->part0, sectors[1]);

	/* Updating the RCU protected object in place is necessary since
	   this function gets called from atomic context.
	   It is valid since all other updates also lead to a completely
	   empty fifo */
	rcu_read_lock();
	plan = rcu_dereference(device->rs_plan_s);
	plan->total = 0;
	fifo_set(plan, 0);
	rcu_read_unlock();
}

void start_resync_timer_fn(unsigned long data)
{
	struct drbd_device *device = (struct drbd_device *) data;
	drbd_device_post_work(device, RS_START);
}

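/* Start a deferred resync (posted as RS_START work above): as long as
 * acks or resync replies are still pending, back off for 100ms;
 * otherwise become sync source (we get here when leaving Ahead mode). */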
static void do_start_resync(struct drbd_device *device)
{
	if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
		drbd_warn(device, "postponing start_resync ...\n");
		device->start_resync_timer.expires = jiffies + HZ/10;
		add_timer(&device->start_resync_timer);
		return;
	}

	drbd_start_resync(device, C_SYNC_SOURCE);
	clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
}

static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
{
	bool csums_after_crash_only;
	rcu_read_lock();
	csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
	rcu_read_unlock();
	return connection->agreed_pro_version >= 89 &&		/* supported? */
		connection->csums_tfm &&			/* configured? */
		(csums_after_crash_only == 0			/* use for each resync? */
		 || test_bit(CRASHED_PRIMARY, &device->flags));	/* or only after Primary crash? */
}

/**
 * drbd_start_resync() - Start the resync process
 * @device:	DRBD device.
 * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
 *
 * This function might bring you directly into one of the
 * C_PAUSED_SYNC_* states.
 */
void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
{
	struct drbd_peer_device *peer_device = first_peer_device(device);
	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
	union drbd_state ns;
	int r;

	if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
		drbd_err(device, "Resync already running!\n");
		return;
	}

	if (!test_bit(B_RS_H_DONE, &device->flags)) {
		if (side == C_SYNC_TARGET) {
			/* Since application IO was locked out during C_WF_BITMAP_T and
			   C_WF_SYNC_UUID we are still unmodified. Before becoming
			   C_SYNC_TARGET, and thereby marking our data inconsistent, give
			   the before-resync-target handler a chance to abort. */
			r = drbd_khelper(device, "before-resync-target");
			r = (r >> 8) & 0xff;
			if (r > 0) {
				drbd_info(device, "before-resync-target handler returned %d, "
					 "dropping connection.\n", r);
				conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
				return;
			}
		} else /* C_SYNC_SOURCE */ {
			r = drbd_khelper(device, "before-resync-source");
			r = (r >> 8) & 0xff;
			if (r > 0) {
				if (r == 3) {
					drbd_info(device, "before-resync-source handler returned %d, "
						 "ignoring. Old userland tools?\n", r);
1665				} else {
1666					drbd_info(device, "before-resync-source handler returned %d, "
1667						 "dropping connection.\n", r);
1668					conn_request_state(connection,
1669							   NS(conn, C_DISCONNECTING), CS_HARD);
1670					return;
1671				}
1672			}
1673		}
1674	}
1675
1676	if (current == connection->worker.task) {
1677		/* The worker should not sleep waiting for state_mutex,
1678		   that can take long */
1679		if (!mutex_trylock(device->state_mutex)) {
1680			set_bit(B_RS_H_DONE, &device->flags);
1681			device->start_resync_timer.expires = jiffies + HZ/5;
1682			add_timer(&device->start_resync_timer);
1683			return;
1684		}
1685	} else {
1686		mutex_lock(device->state_mutex);
1687	}
1688	clear_bit(B_RS_H_DONE, &device->flags);
1689
1690	/* req_lock: serialize with drbd_send_and_submit() and others
1691	 * global_state_lock: for stable sync-after dependencies */
1692	spin_lock_irq(&device->resource->req_lock);
1693	write_lock(&global_state_lock);
1694	/* Did some connection breakage or IO error race with us? */
1695	if (device->state.conn < C_CONNECTED
1696	|| !get_ldev_if_state(device, D_NEGOTIATING)) {
1697		write_unlock(&global_state_lock);
1698		spin_unlock_irq(&device->resource->req_lock);
1699		mutex_unlock(device->state_mutex);
1700		return;
1701	}
1702
1703	ns = drbd_read_state(device);
1704
1705	ns.aftr_isp = !_drbd_may_sync_now(device);
1706
1707	ns.conn = side;
1708
1709	if (side == C_SYNC_TARGET)
1710		ns.disk = D_INCONSISTENT;
1711	else /* side == C_SYNC_SOURCE */
1712		ns.pdsk = D_INCONSISTENT;
1713
1714	r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
1715	ns = drbd_read_state(device);
1716
1717	if (ns.conn < C_CONNECTED)
1718		r = SS_UNKNOWN_ERROR;
1719
1720	if (r == SS_SUCCESS) {
1721		unsigned long tw = drbd_bm_total_weight(device);
1722		unsigned long now = jiffies;
1723		int i;
1724
1725		device->rs_failed    = 0;
1726		device->rs_paused    = 0;
1727		device->rs_same_csum = 0;
1728		device->rs_last_sect_ev = 0;
1729		device->rs_total     = tw;
1730		device->rs_start     = now;
1731		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1732			device->rs_mark_left[i] = tw;
1733			device->rs_mark_time[i] = now;
1734		}
		_drbd_pause_after(device);
		/* Forget potentially stale cached per-resync-extent bit counts.
		 * This open-codes drbd_rs_cancel_all(device): we already have
		 * IRQs disabled, and know the disk state is ok. */
		spin_lock(&device->al_lock);
		lc_reset(device->resync);
		device->resync_locked = 0;
		device->resync_wenr = LC_FREE;
		spin_unlock(&device->al_lock);
	}
	write_unlock(&global_state_lock);
	spin_unlock_irq(&device->resource->req_lock);

	if (r == SS_SUCCESS) {
		wake_up(&device->al_wait); /* for lc_reset() above */
		/* reset rs_last_bcast when a resync or verify is started,
		 * to deal with potential jiffies wrap. */
		device->rs_last_bcast = jiffies - HZ;

		drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
		     drbd_conn_str(ns.conn),
		     (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
		     (unsigned long) device->rs_total);
		if (side == C_SYNC_TARGET) {
			device->bm_resync_fo = 0;
			device->use_csums = use_checksum_based_resync(connection, device);
		} else {
			device->use_csums = 0;
		}

		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
		 * with w_send_oos, or the sync target will get confused as to
		 * how many bits to resync.  We cannot always do that, because for an
		 * empty resync and protocol < 95, we need to do it here, as we call
		 * drbd_resync_finished from here in that case.
		 * So we call drbd_gen_and_send_sync_uuid here for protocol < 96,
		 * and from after_state_ch otherwise. */
		if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
			drbd_gen_and_send_sync_uuid(peer_device);

		if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
			/* This still has a race (about when exactly the peers
			 * detect connection loss) that can lead to a full sync
			 * on next handshake. In 8.3.9 we fixed this with explicit
			 * resync-finished notifications, but the fix
			 * introduces a protocol change.  Sleeping for some
			 * time longer than the ping interval + timeout on the
			 * SyncSource, to give the SyncTarget the chance to
			 * detect connection loss, then waiting for a ping
			 * response (implicit in drbd_resync_finished) reduces
			 * the race considerably, but does not solve it. */
			if (side == C_SYNC_SOURCE) {
				struct net_conf *nc;
				int timeo;

				rcu_read_lock();
				nc = rcu_dereference(connection->net_conf);
				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
				rcu_read_unlock();
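				/* ping_int is in seconds, ping_timeo in tenths
				 * of a second; dividing by 9 rather than 10
				 * presumably buys a little extra slack beyond
				 * the configured ping timeout. */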
				schedule_timeout_interruptible(timeo);
			}
			drbd_resync_finished(device);
		}

		drbd_rs_controller_reset(device);
		/* ns.conn may already be != device->state.conn,
		 * we may have been paused in between, or become paused until
		 * the timer triggers.
		 * No matter, that is handled in resync_timer_fn() */
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&device->resync_timer, jiffies);

		drbd_md_sync(device);
	}
	put_ldev(device);
	mutex_unlock(device->state_mutex);
}

static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
{
	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
	device->rs_last_bcast = jiffies;

	if (!get_ldev(device))
		return;

	drbd_bm_write_lazy(device, 0);
	if (resync_done && is_sync_state(device->state.conn))
		drbd_resync_finished(device);

	drbd_bcast_event(device, &sib);
	/* update the timestamp again, in case the bitmap writeout above took a while */
	device->rs_last_bcast = jiffies;
	put_ldev(device);
}

static void drbd_ldev_destroy(struct drbd_device *device)
{
	lc_destroy(device->resync);
	device->resync = NULL;
	lc_destroy(device->act_log);
	device->act_log = NULL;

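	/* __acquire()/__release() are sparse-only context annotations; they
	 * compile to nothing and merely keep the "local" reference context
	 * balanced for static analysis across drbd_free_ldev(). */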
	__acquire(local);
	drbd_free_ldev(device->ldev);
	device->ldev = NULL;
	__release(local);

	clear_bit(GOING_DISKLESS, &device->flags);
	wake_up(&device->misc_wait);
}

static void go_diskless(struct drbd_device *device)
{
	D_ASSERT(device, device->state.disk == D_FAILED);
	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
	 * the protected members anymore, though, so once put_ldev reaches zero
	 * again, it will be safe to free them. */

	/* Try to write changed bitmap pages, read errors may have just
	 * set some bits outside the area covered by the activity log.
	 *
	 * If we have an IO error during the bitmap writeout,
	 * we will want a full sync next time, just in case.
	 * (Do we want a specific meta data flag for this?)
	 *
	 * If that does not make it to stable storage either,
	 * we cannot do anything about that anymore.
	 *
	 * We still need to check if both bitmap and ldev are present, we may
	 * end up here after a failed attach, before ldev was even assigned.
	 */
	if (device->bitmap && device->ldev) {
		/* An interrupted resync or similar is allowed to recount bits
		 * while we detach.
		 * No further modifications are expected anymore, though.
		 */
		if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
					"detach", BM_LOCKED_TEST_ALLOWED)) {
			if (test_bit(WAS_READ_ERROR, &device->flags)) {
				drbd_md_set_flag(device, MDF_FULL_SYNC);
				drbd_md_sync(device);
			}
		}
	}

	drbd_force_state(device, NS(disk, D_DISKLESS));
}

static int do_md_sync(struct drbd_device *device)
{
	drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
	drbd_md_sync(device);
	return 0;
}

/* only called from drbd_worker thread, no locking */
void __update_timing_details(
		struct drbd_thread_timing_details *tdp,
		unsigned int *cb_nr,
		void *cb,
		const char *fn, const unsigned int line)
{
	unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
	struct drbd_thread_timing_details *td = tdp + i;

	td->start_jif = jiffies;
	td->cb_addr = cb;
	td->caller_fn = fn;
	td->line = line;
	td->cb_nr = *cb_nr;

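	/* Clear the slot we will write next, presumably so whoever dumps the
	 * history can tell where the ring buffer currently wraps. */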
	i = (i+1) % DRBD_THREAD_DETAILS_HIST;
	td = tdp + i;
	memset(td, 0, sizeof(*td));

	++(*cb_nr);
}

static void do_device_work(struct drbd_device *device, const unsigned long todo)
{
	if (test_bit(MD_SYNC, &todo))
		do_md_sync(device);
	if (test_bit(RS_DONE, &todo) ||
	    test_bit(RS_PROGRESS, &todo))
		update_on_disk_bitmap(device, test_bit(RS_DONE, &todo));
	if (test_bit(GO_DISKLESS, &todo))
		go_diskless(device);
	if (test_bit(DESTROY_DISK, &todo))
		drbd_ldev_destroy(device);
	if (test_bit(RS_START, &todo))
		do_start_resync(device);
}

#define DRBD_DEVICE_WORK_MASK	\
	((1UL << GO_DISKLESS)	\
	|(1UL << DESTROY_DISK)	\
	|(1UL << MD_SYNC)	\
	|(1UL << RS_START)	\
	|(1UL << RS_PROGRESS)	\
	|(1UL << RS_DONE)	\
	)

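/* Atomically fetch-and-clear all device-work bits in one step.  The cmpxchg
 * loop retries if any other flag bit changed in the meantime, so flags set
 * concurrently by other contexts are never lost. */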
static unsigned long get_work_bits(unsigned long *flags)
{
	unsigned long old, new;
	do {
		old = *flags;
		new = old & ~DRBD_DEVICE_WORK_MASK;
	} while (cmpxchg(flags, old, new) != old);
	return old & DRBD_DEVICE_WORK_MASK;
}

static void do_unqueued_work(struct drbd_connection *connection)
{
	struct drbd_peer_device *peer_device;
	int vnr;

	rcu_read_lock();
	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
		struct drbd_device *device = peer_device->device;
		unsigned long todo = get_work_bits(&device->flags);
		if (!todo)
			continue;

		kref_get(&device->kref);
		rcu_read_unlock();
		do_device_work(device, todo);
		kref_put(&device->kref, drbd_destroy_device);
		rcu_read_lock();
	}
	rcu_read_unlock();
}

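/* Splice everything currently queued onto the caller's private list in one
 * locked operation; the batch can then be worked on without the queue lock. */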
static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
{
	spin_lock_irq(&queue->q_lock);
	list_splice_tail_init(&queue->q, work_list);
	spin_unlock_irq(&queue->q_lock);
	return !list_empty(work_list);
}

static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
{
	DEFINE_WAIT(wait);
	struct net_conf *nc;
	int uncork, cork;

	dequeue_work_batch(&connection->sender_work, work_list);
	if (!list_empty(work_list))
		return;

	/* Still nothing to do?
	 * Maybe we still need to close the current epoch,
	 * even if no new requests are queued yet.
	 *
	 * Also, poke TCP, just in case.
	 * Then wait for new work (or signal). */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	uncork = nc ? nc->tcp_cork : 0;
	rcu_read_unlock();
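	/* If corking is in use, uncork now so anything the cork is still
	 * holding back gets pushed out before we go to sleep. */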
	if (uncork) {
		mutex_lock(&connection->data.mutex);
		if (connection->data.socket)
			drbd_tcp_uncork(connection->data.socket);
		mutex_unlock(&connection->data.mutex);
	}

	for (;;) {
		int send_barrier;
		prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
		spin_lock_irq(&connection->resource->req_lock);
		spin_lock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
		if (!list_empty(&connection->sender_work.q))
			list_splice_tail_init(&connection->sender_work.q, work_list);
		spin_unlock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
		if (!list_empty(work_list) || signal_pending(current)) {
			spin_unlock_irq(&connection->resource->req_lock);
			break;
		}

		/* We found nothing new to do: no to-be-communicated request,
		 * no other work item.  We may still need to close the last
		 * epoch.  The next incoming request's epoch will be the
		 * connection's current transfer log epoch number.  If that
		 * differs from the epoch of the last request we communicated,
		 * it is safe to send the epoch-separating barrier now.
		 */
		send_barrier =
			atomic_read(&connection->current_tle_nr) !=
			connection->send.current_epoch_nr;
		spin_unlock_irq(&connection->resource->req_lock);

		if (send_barrier)
			maybe_send_barrier(connection,
					connection->send.current_epoch_nr + 1);

		if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
			break;

		/* drbd_send() may have called flush_signals() */
		if (get_t_state(&connection->worker) != RUNNING)
			break;

		schedule();
		/* We may be woken up for things other than new work, too,
		 * e.g. if the current epoch got closed.
		 * In that case we send the barrier above. */
	}
	finish_wait(&connection->sender_work.q_wait, &wait);

	/* someone may have changed the config while we have been waiting above. */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	cork = nc ? nc->tcp_cork : 0;
	rcu_read_unlock();
	mutex_lock(&connection->data.mutex);
	if (connection->data.socket) {
		if (cork)
			drbd_tcp_cork(connection->data.socket);
		else if (!uncork)
			drbd_tcp_uncork(connection->data.socket);
	}
	mutex_unlock(&connection->data.mutex);
}

int drbd_worker(struct drbd_thread *thi)
{
	struct drbd_connection *connection = thi->connection;
	struct drbd_work *w = NULL;
	struct drbd_peer_device *peer_device;
	LIST_HEAD(work_list);
	int vnr;

	while (get_t_state(thi) == RUNNING) {
		drbd_thread_current_set_cpu(thi);

		if (list_empty(&work_list)) {
			update_worker_timing_details(connection, wait_for_work);
			wait_for_work(connection, &work_list);
		}

		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
			update_worker_timing_details(connection, do_unqueued_work);
			do_unqueued_work(connection);
		}

		if (signal_pending(current)) {
			flush_signals(current);
			if (get_t_state(thi) == RUNNING) {
				drbd_warn(connection, "Worker got an unexpected signal\n");
				continue;
			}
			break;
		}

		if (get_t_state(thi) != RUNNING)
			break;

		if (!list_empty(&work_list)) {
			w = list_first_entry(&work_list, struct drbd_work, list);
			list_del_init(&w->list);
			update_worker_timing_details(connection, w->cb);
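			/* The callback's second argument is the "cancel" flag:
			 * without a usable connection, the callback is expected
			 * to clean up rather than try to send anything. */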
			if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
				continue;
			if (connection->cstate >= C_WF_REPORT_PARAMS)
				conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
		}
	}

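	/* On the way out: drain whatever is left, running the callbacks with
	 * the cancel flag set, until both the batch list and the per-device
	 * work bits are empty. */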
	do {
		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
			update_worker_timing_details(connection, do_unqueued_work);
			do_unqueued_work(connection);
		}
		if (!list_empty(&work_list)) {
			w = list_first_entry(&work_list, struct drbd_work, list);
			list_del_init(&w->list);
			update_worker_timing_details(connection, w->cb);
			w->cb(w, 1);
		} else
			dequeue_work_batch(&connection->sender_work, &work_list);
	} while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));

	rcu_read_lock();
	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
		struct drbd_device *device = peer_device->device;
		D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
		kref_get(&device->kref);
		rcu_read_unlock();
		drbd_device_cleanup(device);
		kref_put(&device->kref, drbd_destroy_device);
		rcu_read_lock();
	}
	rcu_read_unlock();

	return 0;
}