/*
* raid1.c : Multiple Devices driver for Linux
*
*
*
* Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
*
* Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
* Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
*
* Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
* bitmapped intelligence in resync:
*
* - bitmap marked during normal i/o
* - bitmap used to skip nondirty blocks during sync
*
* Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
* - persistent bitmap code
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"
#include "raid1.h"
#include "bitmap.h"
#define DEBUG 0
#if DEBUG
#define PRINTK(x...) printk(x)
#else
#define PRINTK(x...)
#endif
/*
* Number of guaranteed r1bios in case of extreme VM load:
*/
#define NR_RAID1_BIOS 256
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
int size = offsetof(r1bio_t, bios[pi->raid_disks]);
/* allocate a r1bio with room for raid_disks entries in the bios array */
return kzalloc(size, gfp_flags);
}
static void r1bio_pool_free(void *r1_bio, void *data)
{
kfree(r1_bio);
}
#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
struct page *page;
r1bio_t *r1_bio;
struct bio *bio;
int i, j;
r1_bio = r1bio_pool_alloc(gfp_flags, pi);
if (!r1_bio)
return NULL;
/*
* Allocate bios : 1 for reading, n-1 for writing
*/
for (j = pi->raid_disks ; j-- ; ) {
bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
if (!bio)
goto out_free_bio;
r1_bio->bios[j] = bio;
}
/*
* Allocate RESYNC_PAGES data pages and attach them to
* the first bio.
* If this is a user-requested check/repair, allocate
* RESYNC_PAGES for each bio.
*/
if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
j = pi->raid_disks;
else
j = 1;
while(j--) {
bio = r1_bio->bios[j];
for (i = 0; i < RESYNC_PAGES; i++) {
page = alloc_page(gfp_flags);
if (unlikely(!page))
goto out_free_pages;
bio->bi_io_vec[i].bv_page = page;
bio->bi_vcnt = i+1;
}
}
/* If not user-requests, copy the page pointers to all bios */
if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
for (i=0; i<RESYNC_PAGES ; i++)
for (j=1; j<pi->raid_disks; j++)
r1_bio->bios[j]->bi_io_vec[i].bv_page =
r1_bio->bios[0]->bi_io_vec[i].bv_page;
}
r1_bio->master_bio = NULL;
return r1_bio;
out_free_pages:
for (j=0 ; j < pi->raid_disks; j++)
for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
j = -1;
out_free_bio:
while ( ++j < pi->raid_disks )
bio_put(r1_bio->bios[j]);
r1bio_pool_free(r1_bio, data);
return NULL;
}
static void r1buf_pool_free(void *__r1_bio, void *data)
{
struct pool_info *pi = data;
int i,j;
r1bio_t *r1bio = __r1_bio;
for (i = 0; i < RESYNC_PAGES; i++)
for (j = pi->raid_disks; j-- ;) {
if (j == 0 ||
r1bio->bios[j]->bi_io_vec[i].bv_page !=
r1bio->bios[0]->bi_io_vec[i].bv_page)
safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
}
for (i=0 ; i < pi->raid_disks; i++)
bio_put(r1bio->bios[i]);
r1bio_pool_free(r1bio, data);
}
static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
{
int i;
for (i = 0; i < conf->raid_disks; i++) {
struct bio **bio = r1_bio->bios + i;
if (*bio && *bio != IO_BLOCKED)
bio_put(*bio);
*bio = NULL;
}
}
static void free_r1bio(r1bio_t *r1_bio)
{
conf_t *conf = r1_bio->mddev->private;
/*
* Wake up any possible resync thread that waits for the device
* to go idle.
*/
allow_barrier(conf);
put_all_bios(conf, r1_bio);
mempool_free(r1_bio, conf->r1bio_pool);
}
static void put_buf(r1bio_t *r1_bio)
{
conf_t *conf = r1_bio->mddev->private;
int i;
for (i=0; i<conf->raid_disks; i++) {
struct bio *bio = r1_bio->bios[i];
if (bio->bi_end_io)
rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
}
mempool_free(r1_bio, conf->r1buf_pool);
lower_barrier(conf);
}
static void reschedule_retry(r1bio_t *r1_bio)
{
unsigned long flags;
mddev_t *mddev = r1_bio->mddev;
conf_t *conf = mddev->private;
spin_lock_irqsave(&conf->device_lock, flags);
list_add(&r1_bio->retry_list, &conf->retry_list);
conf->nr_queued ++;
spin_unlock_irqrestore(&conf->device_lock, flags);
wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
}
/*
* raid_end_bio_io() is called when we have finished servicing a mirrored
* operation and are ready to return a success/failure code to the buffer
* cache layer.
*/
static void raid_end_bio_io(r1bio_t *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
/* if nobody has done the final endio yet, do it now */
if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
(bio_data_dir(bio) == WRITE) ? "write" : "read",
(unsigned long long) bio->bi_sector,
(unsigned long long) bio->bi_sector +
(bio->bi_size >> 9) - 1);
bio_endio(bio,
test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
}
free_r1bio(r1_bio);
}
/*
* Update disk head position estimator based on IRQ completion info.
*/
static inline void update_head_pos(int disk, r1bio_t *r1_bio)
{
conf_t *conf = r1_bio->mddev->private;
conf->mirrors[disk].head_position =
r1_bio->sector + (r1_bio->sectors);
}
static void raid1_end_read_request(struct bio *bio, int error)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
r1bio_t *r1_bio = bio->bi_private;
int mirror;
conf_t *conf = r1_bio->mddev->private;
mirror = r1_bio->read_disk;
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
update_head_pos(mirror, r1_bio);
if (uptodate)
set_bit(R1BIO_Uptodate, &r1_bio->state);
else {
/* If all other devices have failed, we want to return
* the error upwards rather than fail the last device.
* Here we redefine "uptodate" to mean "Don't want to retry"
*/
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
if (r1_bio->mddev->degraded == conf->raid_disks ||
(r1_bio->mddev->degraded == conf->raid_disks-1 &&
!test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
uptodate = 1;
spin_unlock_irqrestore(&conf->device_lock, flags);
}
if (uptodate)
raid_end_bio_io(r1_bio);
else {
/*
* oops, read error:
*/
char b[BDEVNAME_SIZE];
if (printk_ratelimit())
printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
mdname(conf->mddev),
bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
reschedule_retry(r1_bio);
}
rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
}
static void r1_bio_write_done(r1bio_t *r1_bio)
{
if (atomic_dec_and_test(&r1_bio->remaining))
{
/* it really is the end of this request */
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
/* free extra copy of the data pages */
int i = r1_bio->behind_page_count;
while (i--)
safe_put_page(r1_bio->behind_pages[i]);
kfree(r1_bio->behind_pages);
r1_bio->behind_pages = NULL;
}
/* clear the bitmap if all writes complete successfully */
bitmap_endwr