Hi
There is an error in do_failures, in that cycle
while ((bio = bio_list_pop(failures))) --- there exists a path where the
bio is not processed at all and the pointer to the bio is lost --- you
forgot the last "else hold_bio". It deadlocked in my testing.
I'm resending these two patches, correct it according to them.
Mikulas

Hold all write bios when errors are handled.
The patch changes these things:
- previously, the failures list was used only when handling errors with
dmeventd. Now, it is used for all bios. Even when not using dmeventd,
the regions where some writes failed must be marked as nosync. This can only
be done in process context (i.e. in raid1 workqueue), not in
the write_callback function.
- previously, the write would succeed if writing to at least one leg succeeded.
This is wrong because data from the failed leg may be replicated to the
correct leg.
Now, if using dmeventd, the write with some failures will fail be held until
dmeventd does its job and reconfigures the array.
If not using dmeventd, the write still succeeds if at least one leg succeeds,
it is wrong but it is consistent with current behavior.
Signed-off-by: Mikulas Patocka <mpatocka redhat com>
---
drivers/md/dm-raid1.c | 62 ++++++++++++++++++++++++++------------------------
1 file changed, 33 insertions(+), 29 deletions(-)
Index: linux-2.6.32-rc8-devel/drivers/md/dm-raid1.c
===================================================================
--- linux-2.6.32-rc8-devel.orig/drivers/md/dm-raid1.c 2009-11-26 00:55:21.000000000 +0100
+++ linux-2.6.32-rc8-devel/drivers/md/dm-raid1.c 2009-11-26 00:59:58.000000000 +0100
@@ -543,7 +543,6 @@ static void write_callback(unsigned long
unsigned i, ret = 0;
struct bio *bio = (struct bio *) context;
struct mirror_set *ms;
- int uptodate = 0;
unsigned long flags;
ms = bio_get_m(bio)->ms;
@@ -555,33 +554,23 @@ static void write_callback(unsigned long
* This way we handle both writes to SYNC and NOSYNC
* regions with the same code.
*/
- if (likely(!error))
- goto out;
+ if (likely(!error)) {
+ bio_endio(bio, ret);
+ return;
+ }
for (i = 0; i < ms->nr_mirrors; i++)
if (test_bit(i, &error))
fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
- else
- uptodate = 1;
- if (unlikely(!uptodate)) {
- DMERR("All replicated volumes dead, failing I/O");
- /* None of the writes succeeded, fail the I/O. */
- ret = -EIO;
- } else if (errors_handled(ms)) {
- /*
- * Need to raise event. Since raising
- * events can block, we need to do it in
- * the main thread.
- */
- spin_lock_irqsave(&ms->lock, flags);
- bio_list_add(&ms->failures, bio);
- spin_unlock_irqrestore(&ms->lock, flags);
- wakeup_mirrord(ms);
- return;
- }
-out:
- bio_endio(bio, ret);
+ /*
+ * In either case we must mark the region as NOSYNC.
+ * That would block, so do it in the thread.
+ */
+ spin_lock_irqsave(&ms->lock, flags);
+ bio_list_add(&ms->failures, bio);
+ spin_unlock_irqrestore(&ms->lock, flags);
+ wakeup_mirrord(ms);
}
static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -736,13 +725,28 @@ static void do_failures(struct mirror_se
*/
while ((bio = bio_list_pop(failures))) {
- if (ms->log_failure)
- hold_bio(ms, bio);
- else {
- ms->in_sync = 0;
- dm_rh_mark_nosync(ms->rh, bio);
- bio_endio(bio, 0);
+ if (!ms->log_failure) {
+ ms->in_sync = 0;
+ dm_rh_mark_nosync(ms->rh, bio);
}
+ /*
+ * If all the legs are dead, fail the I/O.
+ *
+ * If we are not using dmeventd, we pretend that the I/O
+ * succeeded. This is wrong (the failed leg might come online
+ * again after reboot and it would be replicated back to
+ * the good leg) but it is consistent with current behavior.
+ * For proper behavior, dm-raid1 shouldn't be used without
+ * dmeventd at all.
+ *
+ * If we use dmeventd, hold the bio until dmeventd does its job.
+ */
+ if (!get_valid_mirror(ms))
+ bio_endio(bio, -EIO);
+ else if (!errors_handled(ms))
+ bio_endio(bio, 0);
+ else
+ hold_bio(ms, bio);
}
}