/*
* Copyright (C) 2003 Sistina Software Limited.
* Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
#include "dm-bio-record.h"
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-dirty-log.h>
#include <linux/dm-kcopyd.h>
#include <linux/dm-region-hash.h>
#define DM_MSG_PREFIX "raid1"
#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */
#define DM_RAID1_HANDLE_ERRORS 0x01
#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS)
static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
/*-----------------------------------------------------------------
* Mirror set structures.
*---------------------------------------------------------------*/
enum dm_raid1_error {
DM_RAID1_WRITE_ERROR,
DM_RAID1_FLUSH_ERROR,
DM_RAID1_SYNC_ERROR,
DM_RAID1_READ_ERROR
};
struct mirror {
struct mirror_set *ms;
atomic_t error_count;
unsigned long error_type;
struct dm_dev *dev;
sector_t offset;
};
struct mirror_set {
struct dm_target *ti;
struct list_head list;
uint64_t features;
spinlock_t lock; /* protects the lists */
struct bio_list reads;
struct bio_list writes;
struct bio_list failures;
struct bio_list holds; /* bios are waiting until suspend */
struct dm_region_hash *rh;
struct dm_kcopyd_client *kcopyd_client;
struct dm_io_client *io_client;
/* recovery */
region_t nr_regions;
int in_sync;
int log_failure;
int leg_failure;
atomic_t suspend;
atomic_t default_mirror; /* Default mirror */
struct workqueue_struct *kmirrord_wq;
struct work_struct kmirrord_work;
struct timer_list timer;
unsigned long timer_pending;
struct work_struct trigger_event;
unsigned nr_mirrors;
struct mirror mirror[0];
};
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle,
"A percentage of time allocated for raid resynchronization");
static void wakeup_mirrord(void *context)
{
struct mirror_set *ms = context;
queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
}
static void delayed_wake_fn(unsigned long data)
{
struct mirror_set *ms = (struct mirror_set *) data;
clear_bit(0, &ms->timer_pending);
wakeup_mirrord(ms);
}
static void delayed_wake(struct mirror_set *ms)
{
if (test_and_set_bit(0, &ms->timer_pending))
return;
ms->timer.expires = jiffies + HZ / 5;
ms->timer.data = (unsigned long) ms;
ms->timer.function = delayed_wake_fn;
add_timer(&ms->timer);
}
static void wakeup_all_recovery_waiters(void *context)
{
wake_up_all(&_kmirrord_recovery_stopped);
}
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
{
unsigned long flags;
int should_wake = 0;
struct bio_list *bl;
bl = (rw == WRITE) ? &ms->writes : &ms->reads;
spin_lock_irqsave(&ms->lock, flags);
should_wake = !(bl->head);
bio_list_add(bl, bio);
spin_unlock_irqrestore(&ms->lock, flags);
if (should_wake)
wakeup_mirrord(ms);
}
static void dispatch_bios(void *context, struct bio_list *bio_list)
{
struct mirror_set *ms = context;
struct bio *bio;
while ((bio = bio_list_pop(bio_list)))
queue_bio(ms, bio, WRITE);
}
struct dm_raid1_bio_record {
struct mirror *m;
/* if details->bi_bdev == NULL, details were not saved */
struct dm_bio_details details;
region_t write_region;
};
/*
* Every mirror should look like this one.
*/
#define DEFAULT_MIRROR 0
/*
* This is yucky. We squirrel the mirror struct away inside
* bi_next for read/write buffers. This is safe since the bh
* doesn't get submitted to the lower levels of block layer.
*/
static struct mirror *bio_get_m(struct bio *bio)
{
return (struct mirror *) bio->bi_next;
}
static void bio_set_m(struct bio *bio, struct mirror *m)
{
bio->bi_next = (struct bio *) m;
}
static struct mirror *get_default_mirror(struct mirror_set *ms)
{
return &ms->mirror[atomic_read(&ms->default_mirror)];
}
static void set_default_mirror(struct mirror *m)
{
struct mirror_set *ms = m->ms;
struct mirror *m0 = &(ms->mirror[0]);
atomic_set(&ms->default_mirror, m - m0);
}
static struct mirror *get_valid_mirror(struct mirror_set *ms)
{
struct mirror *m;
for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
if (!atomic_read(&m->error_count))
return m;
return NULL;
}
/* fail_mirror
* @m: mirror device to fail
* @error_type: one of the enum's, DM_RAID1_*_ERROR
*
* If errors are being handled, record the type of
* error encountered for this device. If this type
* of error has already been recorded, we can return;
* otherwise, we must signal userspace by triggering
* an event. Additionally, if the device is the
* primary device, we must choose a new primary, but
* only if the mirror is in-sync.
*
* This function must not block.
*/
static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
{
struct mirror_set *ms = m->ms;
struct mirror *new;
ms->leg_failure = 1;
/*
* error_count is used for nothing more than a
* simple way to tell if a device has encountered
* errors.
*/
atomic_inc(&m->error_count);
if (test_and_set_bit(error_type, &m->error_type))
return;
if (!errors_handled(ms))
return;
if (m != get_default_mirror(ms))
goto out;
if (!ms->in_sync) {
/*
* Better to issue requests to same failing device
* than to risk returning corrupt data.
*/
DMERR("Primary mirror (%s) failed while out-of-sync: "
"Reads may fail.", m->dev->name);
goto out;
}
new = get_valid_mirror(ms);
if (new)
set_default_mirror(new);
else
DMWARN("All sides of mirror have failed.");
out:
schedule_work(&ms->trigger_event);
}
static int mirror_flush(struct dm_target *ti)
{
struct mirror_set *ms = ti->private;
unsigned long error_bits;
unsigned int i;
struct dm_io_region io[ms->nr_mirrors];
struct mirror *m;
struct dm_io_request io_req = {
.bi_rw = WRITE_FLUSH,
.mem.type = DM_IO_KMEM,
.mem.ptr.addr = NULL,
.client = ms->io_client,
};
for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
io[i].bdev = m->dev->bdev;
io[i].sector = 0;
io[i].count = 0;
}
error_bits = -1;
dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
if (unlikely(error_bits != 0)) {
for (i = 0; i < ms->nr_mirrors; i++)
if (test_bit(i, &error_bits))
fail_mirror(ms->mirror + i,
DM_RAID1_FLUSH_ERROR);
return -EIO;
}
return 0;
}
/*-----------------------------------------------------------------
* Recovery.
*
* When a mirror is first activated we may find that some regions
* are in the no-sync state. We have to recover these by
* recopying from the default mirror to all the others.
*---------------------------------------------------------------*/
static void recovery_complete(int read_err, unsigned long write_err,
void *context)
{
struct dm_region *reg = context;
struct mirror_set *ms = dm_rh_region_context(reg);
int m, bit = 0;
if (read_err) {
/* Read error means the failure of default mirror. */
DMERR_LIMIT("Unable to read primary mirror during recovery");
fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR);
}
if (write_err) {
DMERR_LIMIT("Write error during recovery (error = 0x%lx)",
write_err);
/*
* Bits correspond to devices (excluding default mirror).
* The default mirror cannot change during recovery.
*/
for (m = 0; m < ms->nr_mirrors; m++) {
if (&ms->mirror[m] == get_default_mirror(ms))
continue;
if (test_bit(bit, &write_err))
fail_mirror(ms->mirror + m,
DM_RAID1_SYNC_ERROR);
bit++;
}
}
dm_rh_recovery_end(reg, !(read_err || write_err));
}
static int recover(struct mirror_set *ms, struct dm_region *reg)
{
int r;
unsigned i;
struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest;
struct mirror *m;
unsigned long flags = 0;
region_t key = dm_rh_get_region_key(reg);
sector_t region_size = dm_rh_get_region_size(ms->rh);
/*