/*
* CFQ, or complete fairness queueing, disk scheduler.
*
* Based on ideas from a previously unfinished io
* scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
*
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include "blk.h"
#include "blk-cgroup.h"
/*
* tunables
*/
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;
/*
* offset from end of service tree
*/
#define CFQ_IDLE_DELAY (HZ / 5)
/*
* below this threshold, we consider thinktime immediate
*/
#define CFQ_MIN_TT (2)
#define CFQ_SLICE_SCALE (5)
#define CFQ_HW_QUEUE_MIN (5)
#define CFQ_SERVICE_SHIFT 12
#define CFQQ_SEEK_THR (sector_t)(8 * 100)
#define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
static struct kmem_cache *cfq_pool;
#define CFQ_PRIO_LISTS IOPRIO_BE_NR
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
#define sample_valid(samples) ((samples) > 80)
#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
struct cfq_ttime {
unsigned long last_end_request;
unsigned long ttime_total;
unsigned long ttime_samples;
unsigned long ttime_mean;
};
/*
* Most of our rbtree usage is for sorting with min extraction, so
* if we cache the leftmost node we don't have to walk down the tree
* to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
* move this into the elevator for the rq sorting as well.
*/
struct cfq_rb_root {
struct rb_root rb;
struct rb_node *left;
unsigned count;
u64 min_vdisktime;
struct cfq_ttime ttime;
};
#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \
.ttime = {.last_end_request = jiffies,},}
/*
* Per process-grouping structure
*/
struct cfq_queue {
/* reference count */
int ref;
/* various state flags, see below */
unsigned int flags;
/* parent cfq_data */
struct cfq_data *cfqd;
/* service_tree member */
struct rb_node rb_node;
/* service_tree key */
unsigned long rb_key;
/* prio tree member */
struct rb_node p_node;
/* prio tree root we belong to, if any */
struct rb_root *p_root;
/* sorted list of pending requests */
struct rb_root sort_list;
/* if fifo isn't expired, next request to serve */
struct request *next_rq;
/* requests queued in sort_list */
int queued[2];
/* currently allocated requests */
int allocated[2];
/* fifo list of requests in sort_list */
struct list_head fifo;
/* time when queue got scheduled in to dispatch first request. */
unsigned long dispatch_start;
unsigned int allocated_slice;
unsigned int slice_dispatch;
/* time when first request from queue completed and slice started. */
unsigned long slice_start;
unsigned long slice_end;
long slice_resid;
/* pending priority requests */
int prio_pending;
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
/* io prio of this group */
unsigned short ioprio, org_ioprio;
unsigned short ioprio_class;
pid_t pid;
u32 seek_history;
sector_t last_request_pos;
struct cfq_rb_root *service_tree;
struct cfq_queue *new_cfqq;
struct cfq_group *cfqg;
/* Number of sectors dispatched from queue in single dispatch round */
unsigned long nr_sectors;
};
/*
* First index in the service_trees.
* IDLE is handled separately, so it has negative index
*/
enum wl_class_t {
BE_WORKLOAD = 0,
RT_WORKLOAD = 1,
IDLE_WORKLOAD = 2,
CFQ_PRIO_NR,
};
/*
* Second index in the service_trees.
*/
enum wl_type_t {
ASYNC_WORKLOAD = 0,
SYNC_NOIDLE_WORKLOAD = 1,
SYNC_WORKLOAD = 2
};
struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
/* total bytes transferred */
struct blkg_rwstat service_bytes;
/* total IOs serviced, post merge */
struct blkg_rwstat serviced;
/* number of ios merged */
struct blkg_rwstat merged;
/* total time spent on device in ns, may not be accurate w/ queueing */
struct blkg_rwstat service_time;
/* total time spent waiting in scheduler queue in ns */
struct blkg_rwstat wait_time;
/* number of IOs queued up */
struct blkg_rwstat queued;
/* total sectors transferred */
struct blkg_stat sectors;
/* total disk time and nr sectors dispatched by this group */
struct blkg_stat time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
/* time not charged to this cgroup */
struct blkg_stat unaccounted_time;
/* sum of number of ios queued across all samples */
struct blkg_stat avg_queue_size_sum;
/* count of samples taken for average */
struct blkg_stat avg_queue_size_samples;
/* how many times this group has been removed from service tree */
struct blkg_stat dequeue;
/* total time spent waiting for it to be assigned a timeslice. */
struct blkg_stat group_wait_time;
/* time spent idling for this blkcg_gq */
struct blkg_stat idle_time;
/* total time with empty current active q with other requests queued */
struct blkg_stat empty_time;
/* fields after this shouldn't be cleared on stat reset */
uint64_t start_group_wait_time;
uint64_t start_idle_time;
uint64_t start_empty_time;
uint16_t flags;
#endif /* CONFIG_DEBUG_BLK_CGROUP */
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
};
/* This is per cgroup per device grouping structure */
struct cfq_group {
/* must be the first member */
struct blkg_policy_data pd;
/* group service_tree member */
struct rb_node rb_node;
/* group service_tree key */
u64 vdisktime;
/*
* The number of active cfqgs and sum of their weights under this
* cfqg. This covers this cfqg's leaf_weight and all children's
* weights, but does not cover weights of further descendants.
*
* If a cfqg is on the service tree, it's active. An active cfqg
* also activates its parent and contributes to the children_weight
* of the parent.
*/
int nr_active;
unsigned int children_weight;
/*
* vfraction is the fraction of vdisktime that the tasks in this
* cfqg are entitled to. This is determined by compounding the
* ratios walking up from this cfqg to the root.
*
* It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
* vfractions on a service tree is approximately 1. The sum may
* deviate a bit due to rounding errors and fluctuations caused by
* cfqgs entering and leaving the service tree.
*/
unsigned int vfraction;
/*
* There are two weights - (internal) weight is the weight of this
* cfqg against the sibling cfqgs. leaf_weight is the wight of
* this cfqg against the child cfqgs. For the root cfqg, both
* weights are kept in sync for backward compatibility.
*/
unsigned int weight;
unsigned int new_weight;
unsigned int dev_weight;
unsigned int leaf_weight;
unsigned int new_leaf_weight;
unsigned int dev_leaf_weight;
/* number of cfqq currently on this group */
int nr_cfqq;
/*
* Per group busy queues average. Useful for workload slice calc. We
* create the array for each prio class but at run time it is used
* only for RT and BE class and slot for IDLE class remains unused.
* This is primarily done to a