/*
* bootchart-collector - collection framework.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Author: Michael Meeks <michael.meeks@novell.com>
* Copyright (C) 2009-2010 Novell, Inc.
* inspired by Scott James Remnant <scott@netsplit.com>'s work.
*/
/* getdelays.c
*
* Utility to get per-pid and per-tgid delay accounting statistics
* Also illustrates usage of the taskstats interface
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2005
* Copyright (C) Balbir Singh, IBM Corp. 2006
* Copyright (c) Jay Lan, SGI. 2006
*/
#include "common.h"
#include <sys/mount.h>
#include <sys/sysmacros.h>
#include <linux/fs.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>
#include <linux/cgroupstats.h>
#include <signal.h>
#include <sys/sysmacros.h>
/* pid uniqifying code */
/*
 * Per-pid record of what was last seen for a process; the fields are
 * compared against fresh samples so unchanged processes can be skipped.
 */
typedef struct {
	pid_t pid;
	pid_t ppid;
	__u64 time_total;	/* sum of the time counters from the last sample */
} PidEntry;

/*
 * Return the PidEntry slot for @pid from a lazily grown table that is
 * indexed directly by pid.  Freshly added slots are zero-filled.
 *
 * Returns NULL when @pid is negative or when the table cannot be grown;
 * in the latter case the existing table and its contents are left intact.
 *
 * The returned pointer is only valid until the next call that grows the
 * table, since growing may move the storage.
 */
static inline PidEntry *
get_pid_entry (pid_t pid)
{
	static PidEntry *pids = NULL;
	static size_t pids_size = 0;

	/* a negative index would point in front of the table */
	if (pid < 0)
		return NULL;

	if ((size_t) pid >= pids_size) {
		/* grow with some slack so consecutive pids do not each realloc */
		size_t new_size = (size_t) pid + 512;
		PidEntry *grown;

		/* refuse sizes whose byte count would wrap around */
		if (new_size > (size_t) -1 / sizeof (PidEntry))
			return NULL;

		/* keep the old pointer until we know the resize succeeded */
		grown = realloc (pids, sizeof (PidEntry) * new_size);
		if (!grown)
			return NULL;

		memset (grown + pids_size, 0,
			sizeof (PidEntry) * (new_size - pids_size));
		pids = grown;
		pids_size = new_size;
	}

	return pids + pid;
}
/* Netlink socket-set bits */

/* Socket descriptor used for the taskstats requests; initialised to -1. */
static int netlink_socket = -1;
/* Value placed in nlmsg_type when sending taskstats requests. */
static __u16 netlink_taskstats_id;

/*
 * Helpers for walking generic-netlink messages and their attributes.
 * Pointer arithmetic is done on char * (not void *) and every macro
 * argument is parenthesised so that arbitrary expressions can be passed.
 */
/* First byte after the generic-netlink header of the message at glh. */
#define GENLMSG_DATA(glh)	((void *)((char *) NLMSG_DATA(glh) + GENL_HDRLEN))
/* Number of bytes that follow the generic-netlink header. */
#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
/* First payload byte of the attribute at na. */
#define NLA_DATA(na)		((void *)((char *)(na) + NLA_HDRLEN))
/* Payload size of an attribute whose nla_len is len. */
#define NLA_PAYLOAD(len)	((len) - NLA_HDRLEN)

/* Maximum size of response requested or message sent */
#define MAX_MSG_SIZE	1024

/*
 * Wire layout of a generic-netlink message: netlink header, generic
 * netlink header, then up to MAX_MSG_SIZE bytes of attributes.
 */
struct msgtemplate {
	struct nlmsghdr n;
	struct genlmsghdr g;
	char buf[MAX_MSG_SIZE];
};
static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
__u8 genl_cmd, __u16 nla_type,
void *nla_data, int nla_len)
{
struct nlattr *na;
struct sockaddr_nl nladdr;
int r, buflen;
char *buf;
struct msgtemplate msg = { { 0, } };
msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
msg.n.nlmsg_type = nlmsg_type;
msg.n.nlmsg_flags = NLM_F_REQUEST;
msg.n.nlmsg_seq = 0;
msg.n.nlmsg_pid = nlmsg_pid;
msg.g.cmd = genl_cmd;
msg.g.version = 0x1;
na = (struct nlattr *) GENLMSG_DATA(&msg);
na->nla_type = nla_type;
na->nla_len = nla_len + 1 + NLA_HDRLEN;
memcpy(NLA_DATA(na), nla_data, nla_len);
msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
buf = (char *) &msg;
buflen = msg.n.nlmsg_len ;
memset(&nladdr, 0, sizeof(nladdr));
nladdr.nl_family = AF_NETLINK;
while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
sizeof(nladdr))) < buflen) {
if (r > 0) {
buf += r;
buflen -= r;
} else if (errno != EAGAIN)
return -1;
}
return 0;
}
/*
 * Search the attributes nested inside @aggr (a TASKSTATS_TYPE_AGGR_PID
 * attribute) for TASKSTATS_TYPE_STATS and return a pointer to its payload.
 * Returns NULL if there is no such attribute or a nested attribute has a
 * length that cannot be right.
 */
static struct taskstats *
find_nested_taskstats (struct nlattr *aggr)
{
	char *base = (char *) NLA_DATA(aggr);
	int aggr_len = NLA_PAYLOAD(aggr->nla_len);
	int off = 0;

	while (off + NLA_HDRLEN <= aggr_len) {
		struct nlattr *na = (struct nlattr *) (base + off);

		/* shorter than its own header, or running past the parent */
		if (na->nla_len < NLA_HDRLEN || off + na->nla_len > aggr_len)
			return NULL;

		if (na->nla_type == TASKSTATS_TYPE_STATS)
			return (struct taskstats *) NLA_DATA(na);

		/*
		 * Step over exactly this attribute.  Any number of other
		 * attributes may precede the stats record, so the offset has
		 * to be applied to the start of the nest, not to the current
		 * position.
		 */
		off += NLA_ALIGN(na->nla_len);
	}

	return NULL;
}

/*
 * Receive messages from netlink_socket until one contains a
 * TASKSTATS_TYPE_AGGR_PID attribute with a nested TASKSTATS_TYPE_STATS
 * record, and return a pointer to that record.
 *
 * The result points into a static receive buffer and is overwritten by
 * the next call.
 *
 * Returns NULL when recv() fails (other than EINTR), when the message is
 * an NLMSG_ERROR or fails NLMSG_OK, or when an attribute length is
 * malformed.  Messages that are well-formed but carry no stats record are
 * skipped and the next message is awaited.
 */
static struct taskstats *
wait_taskstats (void)
{
	static struct msgtemplate msg;

	for (;;) {
		int rep_len;
		int payload_len;
		int len = 0;

		do {
			rep_len = recv (netlink_socket, &msg, sizeof(msg), 0);
		} while (rep_len < 0 && errno == EINTR);

		/* do not look at stale buffer contents after a failed recv */
		if (rep_len < 0)
			return NULL;

		if (!NLMSG_OK((&msg.n), rep_len) ||
		    msg.n.nlmsg_type == NLMSG_ERROR) {
			/* process died before we got to it or somesuch */
			return NULL;
		}

		payload_len = GENLMSG_PAYLOAD(&msg.n);

		while (len + NLA_HDRLEN <= payload_len) {
			struct nlattr *na = (struct nlattr *)
				((char *) GENLMSG_DATA(&msg) + len);

			/* a bogus length would otherwise loop forever */
			if (na->nla_len < NLA_HDRLEN ||
			    len + na->nla_len > payload_len)
				return NULL;

			if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) {
				struct taskstats *ts = find_nested_taskstats (na);

				if (ts)
					return ts;
			}

			len += NLA_ALIGN(na->nla_len);
		}
		/* nothing of interest in this message: wait for the next */
	}
}
/*
 * Fetch the taskstats record of a single pid.
 *
 * Sends a TASKSTATS_CMD_GET request with a TASKSTATS_CMD_ATTR_PID
 * attribute on netlink_socket and waits for the answer.
 *
 * Returns whatever wait_taskstats() hands back (a pointer into its static
 * receive buffer, so it is clobbered by the next request), or NULL if the
 * request could not be sent, no usable reply arrived, or the reply is for
 * a different pid than the one asked for.
 */
static struct taskstats *
get_taskstats (pid_t pid)
{
	struct taskstats *reply;

	/* set_pid */
	if (send_cmd (netlink_socket, netlink_taskstats_id, 0,
		      TASKSTATS_CMD_GET, TASKSTATS_CMD_ATTR_PID,
		      &pid, sizeof(__u32)) < 0)
		return NULL;

	/* get reply */
	reply = wait_taskstats ();
	if (reply == NULL)
		return NULL;

	if (reply->ac_pid == pid)
		return reply;

	log ("Serious error got data for wrong pid: %d %d\n",
	     (int)reply->ac_pid, (int)pid);
	return NULL;
}
/*
 * Unfortunately the TGID stuff doesn't work at all well
 * in the kernel - we have to manually aggregate here.
 *
 * Starts from the taskstats of the scanner's current pid and adds the
 * cpu_run_real_total, swapin_delay_total and blkio_delay_total of every
 * task the scanner enumerates for it.  Tasks whose stats cannot be
 * fetched are left out of the sums.
 *
 * Returns a pointer to a static record that is reused by every call, or
 * NULL if the stats of the current pid itself could not be fetched.
 */
static struct taskstats *
get_tgid_taskstats (PidScanner *scanner)
{
	static struct taskstats tgits;
	struct taskstats *first;
	pid_t tid;

	memset (&tgits, 0, sizeof tgits);

	first = get_taskstats (pid_scanner_get_cur_pid (scanner));
	if (first == NULL)
		return NULL;

	/* take a copy before the next get_taskstats() call */
	tgits = *first;

	pid_scanner_get_tasks_start (scanner);
	for (tid = pid_scanner_get_tasks_next (scanner);
	     tid != 0;
	     tid = pid_scanner_get_tasks_next (scanner)) {
		struct taskstats *task = get_taskstats (tid);

		if (task != NULL) {
			/* aggregate */
			tgits.cpu_run_real_total += task->cpu_run_real_total;
			tgits.swapin_delay_total += task->swapin_delay_total;
			tgits.blkio_delay_total += task->blkio_delay_total;
		}
	}
	pid_scanner_get_tasks_stop (scanner);

	return &tgits;
}
/*
* Linux exports one set of quite good data in:
* /proc/./stat: linux/fs/proc/array.c (do_task_stat)
* and another high-res (but different) set of data in:
* linux/kernel/tsacct.c
* linux/kernel/delayacct.c - needs delay accounting enabled
*/
static void
dump_taskstat (BufferFile *file, PidScanner *scanner)
{
pid_t ppid;
int output_len;
char output_line[1024];
PidEntry *entry;
__u64 time_total;
struct taskstats *ts;
ts = get_tgid_taskstats (scanner);
if (!ts) /* process exited before we got there */
return;
/* reduce the amount of parsing we have to do later */
entry = get_pid_entry (ts->ac_pid);
time_total = (ts->cpu_run_real_total + ts->blkio_delay_total +
ts->swapin_delay_total);
if (entry->time_total == time_total && entry->ppid == ts->ac_ppid)
return;
entry->time_total = time_total;
entry->ppid = ts->ac_ppid;
/* we can get a much cleaner ppid from PROC_EVENTS */
ppid = pid_scanner_get_cur_ppid (scanner);
if (!ppid)
ppid = ts->ac_ppid;
/* NB. ensure we aggregate all fields we need in get_tgid_taskstats */
output_len = snprintf (output_line, 1024, "%d %d %s %lld %lld %lld\n",
ts->ac_pid, ppid, ts->ac_comm,
(long long)ts->cpu_run_real_total,
(long long)ts->blkio_delay_total,
(long long)ts->swapin_delay_total);
if (output_len < 0)
return;
buffer_file_append (file, output_line, output_len);
/* FIXME - can we get better stats on what is waiting for what ?
'blkio_count / blkio_delay_total' ... [etc.]
'delay waiting for CPU while runnable' ... [!] fun :-) */
/* The data we get from /proc is: */
/*
opid, cmd, state, ppid = float(tokens[0]), ' '.join(tokens[1:2+offset]), tokens[2+offset], int(tokens[3+offset])
userCpu, sysCpu, stime= int(tokens[13+offset]), int(tokens[14+offset]), int(tokens[21+offset]) */
/* op