/*
* An async IO implementation for Linux
* Written by Benjamin LaHaise <[email protected]>
*
* Implements an efficient asynchronous io interface.
*
* Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
*
* See ../COPYING for licensing terms.
*/
#define pr_fmt(fmt) "%s: " fmt, __func__
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/migrate.h>
#include <linux/ramfs.h>
#include <linux/percpu-refcount.h>
#include <linux/mount.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
#include "internal.h"
#define AIO_RING_MAGIC 0xa10a10a1
#define AIO_RING_COMPAT_FEATURES 1
#define AIO_RING_INCOMPAT_FEATURES 0
struct aio_ring {
unsigned id; /* kernel internal index number */
unsigned nr; /* number of io_events */
unsigned head; /* Written to by userland or under ring_lock
* mutex by aio_read_events_ring(). */
unsigned tail;
unsigned magic;
unsigned compat_features;
unsigned incompat_features;
unsigned header_length; /* size of aio_ring */
struct io_event io_events[0];
}; /* 128 bytes + ring size */
#define AIO_RING_PAGES 8
struct kioctx_table {
struct rcu_head rcu;
unsigned nr;
struct kioctx *table[];
};
struct kioctx_cpu {
unsigned reqs_available;
};
struct kioctx {
struct percpu_ref users;
atomic_t dead;
struct percpu_ref reqs;
unsigned long user_id;
struct __percpu kioctx_cpu *cpu;
/*
* For percpu reqs_available, number of slots we move to/from global
* counter at a time:
*/
unsigned req_batch;
/*
* This is what userspace passed to io_setup(), it's not used for
* anything but counting against the global max_reqs quota.
*
* The real limit is nr_events - 1, which will be larger (see
* aio_setup_ring())
*/
unsigned max_reqs;
/* Size of ringbuffer, in units of struct io_event */
unsigned nr_events;
unsigned long mmap_base;
unsigned long mmap_size;
struct page **ring_pages;
long nr_pages;
struct work_struct free_work;
/*
* signals when all in-flight requests are done
*/
struct completion *requests_done;
struct {
/*
* This counts the number of available slots in the ringbuffer,
* so we avoid overflowing it: it's decremented (if positive)
* when allocating a kiocb and incremented when the resulting
* io_event is pulled off the ringbuffer.
*
* We batch accesses to it with a percpu version.
*/
atomic_t reqs_available;
} ____cacheline_aligned_in_smp;
struct {
spinlock_t ctx_lock;
struct list_head active_reqs; /* used for cancellation */
} ____cacheline_aligned_in_smp;
struct {
struct mutex ring_lock;
wait_queue_head_t wait;
} ____cacheline_aligned_in_smp;
struct {
unsigned tail;
unsigned completed_events;
spinlock_t completion_lock;
} ____cacheline_aligned_in_smp;
struct page *internal_pages[AIO_RING_PAGES];
struct file *aio_ring_file;
unsigned id;
};
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
unsigned long aio_nr; /* current system wide number of aio requests */
unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
static struct vfsmount *aio_mnt;
static const struct file_operations aio_ring_fops;
static const struct address_space_operations aio_ctx_aops;
/* Backing dev info for aio fs.
* -no dirty page accounting or writeback happens
*/
static struct backing_dev_info aio_fs_backing_dev_info = {
.name = "aiofs",
.state = 0,
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
};
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
struct qstr this = QSTR_INIT("[aio]", 5);
struct file *file;
struct path path;
struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
if (IS_ERR(inode))
return ERR_CAST(inode);
inode->i_mapping->a_ops = &aio_ctx_aops;
inode->i_mapping->private_data = ctx;
inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
inode->i_size = PAGE_SIZE * nr_pages;
path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
if (!path.dentry) {
iput(inode);
return ERR_PTR(-ENOMEM);
}
path.mnt = mntget(aio_mnt);
d_instantiate(path.dentry, inode);
file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
if (IS_ERR(file)) {
path_put(&path);
return file;
}
file->f_flags = O_RDWR;
return file;
}
static struct dentry *aio_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
static const struct dentry_operations ops = {
.d_dname = simple_dname,
};
return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
}
/* aio_setup
* Creates the slab caches used by the aio routines, panic on
* failure as this is done early during the boot sequence.
*/
static int __init aio_setup(void)
{
static struct file_system_type aio_fs = {
.name = "aio",
.mount = aio_mount,
.kill_sb = kill_anon_super,
};
aio_mnt = kern_mount(&aio_fs);
if (IS_ERR(aio_mnt))
panic("Failed to create aio fs mount.");
if (bdi_init(&aio_fs_backing_dev_info))
panic("Failed to init aio fs backing dev info.");
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
return 0;
}
__initcall(aio_setup);
static void put_aio_ring_file(struct kioctx *ctx)
{
struct file *aio_ring_file = ctx->aio_ring_file;
if (aio_ring_file) {
truncate_setsize(aio_ring_file->f_inode, 0);
/* Prevent further access to the kioctx from migratepages */
spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
aio_ring_file->f_inode->i_mapping->private_data = NULL;
ctx->aio_ring_file = NULL;
spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
fput(aio_ring_file);
}
}
static void aio_free_ring(struct kioctx *ctx)
{
int i;
/* Disconnect the kiotx from the ring file. This prevents future
* accesses to the kioctx from page migration.
*/
put_aio_ring_file(ctx);
for (i = 0; i < ctx->nr_pages; i++) {
struct page *page;
pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
page_count(ctx->ring_pages[i]));
page = ctx->ring_pages[i];
if (!page)
continue;
ctx->ring_pages[i] = NULL;
put_page(page);
}
if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
kfree(ctx->ring_pages);
ctx->ring_pages = NULL;
}
}
static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
{
vma->vm_flags |= VM_DONTEXPAND;
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
int i;
spin_lock(&mm->ioctx_lock);
rcu_read_lock();
table = rcu_dereference(mm->ioctx_table);
for (i = 0; i < table->nr; i++) {
struct kioctx *ctx;
ctx = table->table[i];
if (ctx && ctx->aio_ring_file == file) {
ctx->user_id = ctx->mmap_base = vma->vm_start;
break;
}
}
rcu_read_unlock();
spin_unlock(&mm->ioctx_lock);
}
static const struct file_operations aio_ring_fops = {
.mmap = aio_ring_mmap,
.mremap = aio_ring_remap,
};
#if IS_ENABLED(CONFIG_MIGRATION)
static int aio_migratepage(struct address_space *mapping, struct page *new,
struct page *old, enum migrate_mode mode)
{
struct