/*
* Network block device - make block devices work over TCP
*
* Note that you can not swap over this thing, yet. Seems to work but
* deadlocks sometimes - you can not swap over TCP in general.
*
* Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
* Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
*
* This file is released under GPLv2 or later.
*
* (part of code stolen from loop.c)
*/
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <linux/mutex.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/net.h>
#include <linux/kthread.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <linux/nbd.h>
#define NBD_MAGIC 0x68797548
#ifdef NDEBUG
#define dprintk(flags, fmt...)
#else /* NDEBUG */
#define dprintk(flags, fmt...) do { \
if (debugflags & (flags)) printk(KERN_DEBUG fmt); \
} while (0)
#define DBG_IOCTL 0x0004
#define DBG_INIT 0x0010
#define DBG_EXIT 0x0020
#define DBG_BLKDEV 0x0100
#define DBG_RX 0x0200
#define DBG_TX 0x0400
static unsigned int debugflags;
#endif /* NDEBUG */
static unsigned int nbds_max = 16;
static struct nbd_device *nbd_dev;
static int max_part;
/*
* Use just one lock (or at most 1 per NIC). Two arguments for this:
* 1. Each NIC is essentially a synchronization point for all servers
* accessed through that NIC so there's no need to have more locks
* than NICs anyway.
* 2. More locks lead to more "Dirty cache line bouncing" which will slow
* down each lock to the point where they're actually slower than just
* a single lock.
* Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
*/
static DEFINE_SPINLOCK(nbd_lock);
#ifndef NDEBUG
static const char *ioctl_cmd_to_ascii(int cmd)
{
switch (cmd) {
case NBD_SET_SOCK: return "set-sock";
case NBD_SET_BLKSIZE: return "set-blksize";
case NBD_SET_SIZE: return "set-size";
case NBD_SET_TIMEOUT: return "set-timeout";
case NBD_SET_FLAGS: return "set-flags";
case NBD_DO_IT: return "do-it";
case NBD_CLEAR_SOCK: return "clear-sock";
case NBD_CLEAR_QUE: return "clear-que";
case NBD_PRINT_DEBUG: return "print-debug";
case NBD_SET_SIZE_BLOCKS: return "set-size-blocks";
case NBD_DISCONNECT: return "disconnect";
case BLKROSET: return "set-read-only";
case BLKFLSBUF: return "flush-buffer-cache";
}
return "unknown";
}
static const char *nbdcmd_to_ascii(int cmd)
{
switch (cmd) {
case NBD_CMD_READ: return "read";
case NBD_CMD_WRITE: return "write";
case NBD_CMD_DISC: return "disconnect";
case NBD_CMD_FLUSH: return "flush";
case NBD_CMD_TRIM: return "trim/discard";
}
return "invalid";
}
#endif /* NDEBUG */
static void nbd_end_request(struct request *req)
{
int error = req->errors ? -EIO : 0;
struct request_queue *q = req->q;
unsigned long flags;
dprintk(DBG_BLKDEV, "%s: request %p: %s\n", req->rq_disk->disk_name,
req, error ? "failed" : "done");
spin_lock_irqsave(q->queue_lock, flags);
__blk_end_request_all(req, error);
spin_unlock_irqrestore(q->queue_lock, flags);
}
static void sock_shutdown(struct nbd_device *nbd, int lock)
{
/* Forcibly shutdown the socket causing all listeners
* to error
*
* FIXME: This code is duplicated from sys_shutdown, but
* there should be a more generic interface rather than
* calling socket ops directly here */
if (lock)
mutex_lock(&nbd->tx_lock);
if (nbd->sock) {
dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
nbd->sock = NULL;
}
if (lock)
mutex_unlock(&nbd->tx_lock);
}
static void nbd_xmit_timeout(unsigned long arg)
{
struct task_struct *task = (struct task_struct *)arg;
printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n",
task->comm, task->pid);
force_sig(SIGKILL, task);
}
/*
* Send or receive packet.
*/
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
int msg_flags)
{
struct socket *sock = nbd->sock;
int result;
struct msghdr msg;
struct kvec iov;
sigset_t blocked, oldset;
unsigned long pflags = current->flags;
if (unlikely(!sock)) {
dev_err(disk_to_dev(nbd->disk),
"Attempted %s on closed socket in sock_xmit\n",
(send ? "send" : "recv"));
return -EINVAL;
}
/* Allow interception of SIGKILL only
* Don't allow other signals to interrupt the transmission */
siginitsetinv(&blocked, sigmask(SIGKILL));
sigprocmask(SIG_SETMASK, &blocked, &oldset);
current->flags |= PF_MEMALLOC;
do {
sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
iov.iov_base = buf;
iov.iov_len = size;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = msg_flags | MSG_NOSIGNAL;
if (send) {
struct timer_list ti;
if (nbd->xmit_timeout) {
init_timer(&ti);
ti.function = nbd_xmit_timeout;
ti.data = (unsigned long)current;
ti.expires = jiffies + nbd->xmit_timeout;
add_timer(&ti);
}
result = kernel_sendmsg(sock, &msg, &iov, 1, size);
if (nbd->xmit_timeout)
del_timer_sync(&ti);
} else
result = kernel_recvmsg(sock, &msg, &iov, 1, size,
msg.msg_flags);
if (signal_pending(current)) {
siginfo_t info;
printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n",
task_pid_nr(current), current->comm,
dequeue_signal_lock(current, ¤t->blocked, &info));
result = -EINTR;
sock_shutdown(nbd, !send);
break;
}
if (result <= 0) {
if (result == 0)
result = -EPIPE; /* short read */
break;
}
size -= result;
buf += result;
} while (size > 0);
sigprocmask(SIG_SETMASK, &oldset, NULL);
tsk_restore_flags(current, pflags, PF_MEMALLOC);
return result;
}
static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
int flags)
{
int result;
void *kaddr = kmap(bvec->bv_page);
result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
bvec->bv_len, flags);
kunmap(bvec->bv_page);
return result;
}
/* always call with the tx_lock held */
static int nbd_send_req(struct nbd_device *nbd, struct request *req)
{
int result, flags;
struct nbd_request request;
unsigned long size = blk_rq_bytes(req);
memset(&request, 0, sizeof(request));
request.magic = htonl(NBD_REQUEST_MAGIC);
request.type = htonl(nbd_cmd(req));
if (nbd_cmd(req) != NBD_CMD_FLUSH && nbd_cmd(req) != NBD_CMD_DISC) {
request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
request.len = htonl(size);
}
memcpy(request.handle, &req, sizeof(req));
dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
nbd->disk->disk_name, req,
nbdcmd_to_ascii(nbd_cmd(req)),
(unsigned long long)blk_rq_pos(req) << 9,
blk_rq_bytes(req));
result = sock_xmit(nbd, 1, &request, sizeof(request),
(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
if (result <= 0) {
dev_err(disk_to_dev(nbd->disk),
"Send control failed (result %d)\n", result);
goto error_out;
}
if (nbd_cmd(req) == NBD_CMD_WRITE) {
struct req_iterator iter;
struct bio_vec bvec;
/*
* we are really probing at internals to determine
* whether to set MSG_MORE or not...
*/
rq_for_each_segment(bvec, req, iter) {
flags = 0;
if (!rq_iter_last(bvec, iter))
flags = MSG_MORE;
dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
nbd->disk->disk_name, req, bvec.bv_len);
result = sock_send_bvec(nbd, &bvec, flags);
if (result <= 0) {
dev_err(disk_to_dev(nbd->disk),
"Send data failed (result %d)\n",
result);
goto error_out;
}
}
}
return 0;
error_out:
return -EIO;
}
static struct request *nbd_find_request(struct nbd_device *nbd,
struct request *xreq)
{
struct request *req, *tmp;
int err;
err = wait_event_inte