/*
* numa.c
*
* numa: Simulate NUMA-sensitive workloads and measure their NUMA performance
*/
#include "../perf.h"
#include "../builtin.h"
#include "../util/util.h"
#include "../util/parse-options.h"
#include "bench.h"
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/prctl.h>
#include <sys/types.h>
#include <numa.h>
#include <numaif.h>
/*
 * Regular printout to the terminal, suppressed if -q is specified.
 *
 * The macro prints only when the global state 'g' is non-NULL and
 * g->p.show_details >= 0.
 * NOTE(review): presumably -q pushes show_details below 0 elsewhere in the
 * file - that code is not visible here, confirm.
 */
#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)
/*
 * Debug printf:
 *
 * Prints only when the global state 'g' is non-NULL and
 * g->p.show_details >= 1 (show_details is incremented by each -d option).
 */
#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0)
/*
 * Per-task bookkeeping record; struct global_info keeps a pointer to these
 * in its 'threads' member.
 *
 * NOTE(review): the code that fills in and reads these fields is not visible
 * in this part of the file, so the per-field notes below are inferred from
 * the names and types only - confirm against the users.
 */
struct thread_data {
/* presumably the CPU the task was last observed on - confirm */
int curr_cpu;
/* presumably the CPU affinity mask / memory node this task is bound to - confirm */
cpu_set_t bind_cpumask;
int bind_node;
/* presumably the per-process working set shared by the process's threads - confirm */
u8 *process_data;
/* presumably process index, thread index within the process, and global task index - confirm */
int process_nr;
int thread_nr;
int task_nr;
/* presumably progress/result counters reported by the task - confirm */
unsigned int loops_done;
u64 val;
u64 runtime_ns;
/* presumably serializes the "process locked" memory accesses - confirm */
pthread_mutex_t *process_lock;
};
/*
 * Parameters set by options.
 *
 * The options[] table stores parsed command-line values into a file-level
 * instance of this struct (p0); struct global_info embeds another instance
 * as its 'p' member.
 */
struct params {
/* Startup synchronization (-S): */
bool serialize_startup;
/* Task hierarchy (-p, -t): */
int nr_proc;
int nr_threads;
/* Working set sizes, as the raw option strings (-G, -P, -L, -T): */
const char *mb_global_str;
const char *mb_proc_str;
const char *mb_proc_locked_str;
const char *mb_thread_str;
/* NOTE(review): presumably the numeric MB values parsed from the strings above - confirm: */
double mb_global;
double mb_proc;
double mb_proc_locked;
double mb_thread;
/* Access patterns to the working set (-R, -W, -B, -Z, -r, -l, -s, -u): */
bool data_reads;
bool data_writes;
bool data_backwards;
bool data_zero_memset;
bool data_rand_walk;
u32 nr_loops;
u32 nr_secs;
u32 sleep_usecs;
/* Working set initialization (-z, -I, -0): */
bool init_zero;
bool init_random;
bool init_cpu0;
/* Misc options (-d, -a, -H): */
int show_details;
int run_all;
int thp;
/*
 * No entry in options[] targets the bytes_* and nr_tasks fields.
 * NOTE(review): presumably derived from the MB sizes and the
 * process/thread counts - confirm.
 */
long bytes_global;
long bytes_process;
long bytes_process_locked;
long bytes_thread;
int nr_tasks;
/* -q, -c, -m, -x: */
bool show_quiet;
bool show_convergence;
bool measure_convergence;
int perturb_secs;
/*
 * Not set by options[]; read by the CPU/node binding helpers as the
 * number of CPUs and NUMA nodes.
 */
int nr_cpus;
int nr_nodes;
/* Affinity options -C and -M: */
char *cpu_list_str;
char *node_list_str;
};
/*
 * Global, read-writable area, accessible to all processes and threads.
 *
 * Reached through the file-level pointer 'g'.
 * NOTE(review): how this area is allocated and shared is not visible in this
 * part of the file; the per-field notes below are inferred from names only.
 */
struct global_info {
/* presumably the global working set - confirm */
u8 *data;
/* presumably startup/shutdown synchronization state for the tasks - confirm */
pthread_mutex_t startup_mutex;
int nr_tasks_started;
pthread_mutex_t startup_done_mutex;
pthread_mutex_t start_work_mutex;
int nr_tasks_working;
pthread_mutex_t stop_work_mutex;
/* presumably the total amount of data processed so far - confirm */
u64 bytes_done;
/* per-task records, see struct thread_data */
struct thread_data *threads;
/* Convergence latency measurement: */
bool all_converged;
bool stop_work;
int print_once;
/* The run parameters; read via g->p by the printing macros and binding helpers */
struct params p;
};
/*
 * Pointer to the global state. Starts out NULL; the tprintf()/dprintf()
 * macros check it before dereferencing.
 */
static struct global_info *g = NULL;
/* Forward declarations of the callbacks used by the -C/--cpus and -M/--memnodes entries of options[]: */
static int parse_cpus_opt(const struct option *opt, const char *arg, int unset);
static int parse_nodes_opt(const struct option *opt, const char *arg, int unset);
/* Parameter storage that the options[] table parses command-line values into: */
struct params p0;
/*
 * Command-line option table.
 *
 * Every entry except the two callbacks stores its parsed value directly in
 * the matching member of p0. The -C and -M entries carry no value pointer
 * (NULL) and hand the argument string to parse_cpus_opt()/parse_nodes_opt().
 *
 * The help strings are user-visible: -R describes reads (not writes) and
 * -q describes quiet mode; both were previously copy-pasted from the -W and
 * -z entries respectively.
 */
static const struct option options[] = {
	OPT_INTEGER('p', "nr_proc", &p0.nr_proc, "number of processes"),
	OPT_INTEGER('t', "nr_threads", &p0.nr_threads, "number of threads per process"),

	OPT_STRING('G', "mb_global", &p0.mb_global_str, "MB", "global memory (MBs)"),
	OPT_STRING('P', "mb_proc", &p0.mb_proc_str, "MB", "process memory (MBs)"),
	OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str, "MB", "process serialized/locked memory access (MBs), <= process_memory"),
	OPT_STRING('T', "mb_thread", &p0.mb_thread_str, "MB", "thread memory (MBs)"),

	OPT_UINTEGER('l', "nr_loops", &p0.nr_loops, "max number of loops to run"),
	OPT_UINTEGER('s', "nr_secs", &p0.nr_secs, "max number of seconds to run"),
	OPT_UINTEGER('u', "usleep", &p0.sleep_usecs, "usecs to sleep per loop iteration"),

	OPT_BOOLEAN('R', "data_reads", &p0.data_reads, "access the data via reads (can be mixed with -W)"),
	OPT_BOOLEAN('W', "data_writes", &p0.data_writes, "access the data via writes (can be mixed with -R)"),
	OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"),
	OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset, "access the data via glibc bzero only"),
	OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"),

	OPT_BOOLEAN('z', "init_zero", &p0.init_zero, "bzero the initial allocations"),
	OPT_BOOLEAN('I', "init_random", &p0.init_random, "randomize the contents of the initial allocations"),
	OPT_BOOLEAN('0', "init_cpu0", &p0.init_cpu0, "do the initial allocations on CPU#0"),
	OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"),

	OPT_INCR('d', "show_details", &p0.show_details, "Show details"),
	OPT_INCR('a', "all", &p0.run_all, "Run all tests in the suite"),
	OPT_INTEGER('H', "thp", &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
	OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
	OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
	OPT_BOOLEAN('q', "quiet", &p0.show_quiet, "quiet mode"),
	OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup, "serialize thread startup"),

	/* Special option string parsing callbacks: */
	OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]",
		     "bind the first N tasks to these specific cpus (the rest is unbound)",
		     parse_cpus_opt),
	OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]",
		     "bind the first N tasks to these specific memory nodes (the rest is unbound)",
		     parse_nodes_opt),
	OPT_END()
};
/*
 * NULL-terminated list of usage lines for the "perf bench numa" command.
 * NOTE(review): presumably handed to the option parser's usage output - the
 * call site is not visible here.
 */
static const char * const bench_numa_usage[] = {
"perf bench numa <options>",
NULL
};
/*
 * NULL-terminated list of usage lines for the "perf bench numa mem"
 * sub-command.
 * NOTE(review): the call site is not visible here.
 */
static const char * const numa_usage[] = {
"perf bench numa mem [<options>]",
NULL
};
/*
 * Pin the calling task to a single CPU, or to CPUs 0..nr_cpus-1 when
 * target_cpu is -1.
 *
 * Returns the affinity mask that was in effect before the change, so the
 * caller can hand it to bind_to_cpumask() later. An out-of-range CPU or any
 * failing affinity call is fatal (BUG_ON).
 */
static cpu_set_t bind_to_cpu(int target_cpu)
{
	cpu_set_t prev_affinity, new_affinity;
	int first, last, i;
	int err;

	err = sched_getaffinity(0, sizeof(prev_affinity), &prev_affinity);
	BUG_ON(err);

	if (target_cpu != -1) {
		BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
		/* A one-element range: just the requested CPU */
		first = target_cpu;
		last = target_cpu + 1;
	} else {
		/* -1 means "no specific CPU": allow all of them */
		first = 0;
		last = g->p.nr_cpus;
	}

	CPU_ZERO(&new_affinity);
	for (i = first; i < last; i++)
		CPU_SET(i, &new_affinity);

	err = sched_setaffinity(0, sizeof(new_affinity), &new_affinity);
	BUG_ON(err);

	return prev_affinity;
}
/*
 * Pin the calling task to the CPUs of one node, or to CPUs 0..nr_cpus-1
 * when target_node is -1.
 *
 * Nodes are taken to be equally sized, contiguous CPU ranges: node N covers
 * CPUs [N * cpus_per_node, (N + 1) * cpus_per_node). A CPU count that does
 * not divide evenly by the node count is fatal (BUG_ON), as is any failing
 * affinity call.
 *
 * Returns the affinity mask that was in effect before the change.
 */
static cpu_set_t bind_to_node(int target_node)
{
	int node_width = g->p.nr_cpus/g->p.nr_nodes;
	cpu_set_t prev_affinity, new_affinity;
	int first, last, i;
	int err;

	BUG_ON(node_width*g->p.nr_nodes != g->p.nr_cpus);
	BUG_ON(!node_width);

	err = sched_getaffinity(0, sizeof(prev_affinity), &prev_affinity);
	BUG_ON(err);

	if (target_node != -1) {
		first = (target_node + 0) * node_width;
		last = (target_node + 1) * node_width;
		BUG_ON(last > g->p.nr_cpus);
	} else {
		/* -1 means "no specific node": allow every CPU */
		first = 0;
		last = g->p.nr_cpus;
	}

	CPU_ZERO(&new_affinity);
	for (i = first; i < last; i++)
		CPU_SET(i, &new_affinity);

	err = sched_setaffinity(0, sizeof(new_affinity), &new_affinity);
	BUG_ON(err);

	return prev_affinity;
}
/*
 * Apply the given affinity mask to the calling task; failure is fatal
 * (BUG_ON). Accepts the masks returned by bind_to_cpu()/bind_to_node().
 */
static void bind_to_cpumask(cpu_set_t mask)
{
	int err = sched_setaffinity(0, sizeof(mask), &mask);

	BUG_ON(err);
}
/*
 * Switch the caller's memory policy back to MPOL_DEFAULT (no node mask);
 * failure is fatal (BUG_ON).
 */
static void mempol_restore(void)
{
	int err = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1);

	BUG_ON(err);
}
/*
 * Restrict the caller's memory allocations to a single node via
 * set_mempolicy(MPOL_BIND).
 *
 * @node: node number to bind to, or -1 to leave the memory policy untouched.
 *
 * The node mask is a single unsigned long, so only as many nodes as that
 * type has bits can be represented. Exceeding that, passing a node outside
 * the mask, or a failing set_mempolicy() call is fatal (BUG_ON).
 */
static void bind_to_memnode(int node)
{
	/* Capacity of the mask in bits (one bit per node) */
	const int mask_bits = (int)sizeof(unsigned long)*8;
	unsigned long nodemask;
	int ret;

	if (node == -1)
		return;

	/* Compare the node count against the mask width in bits, not bytes: */
	BUG_ON(g->p.nr_nodes > mask_bits);
	/* Shifting by a negative or too-large count would be undefined: */
	BUG_ON(node < 0 || node >= mask_bits);

	/* Unsigned shift, so that the top bit can be set without overflow: */
	nodemask = 1UL << node;

	ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8);
	dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret);

	BUG_ON(ret);
}
/*
 * 2 MiB, in bytes.
 * NOTE(review): the name suggests a huge page size; its users are not
 * visible in this part of the file - confirm.
 */
#define HPSIZE (2*1024*1024)
/*
 * Format a printf-style name into a 20-byte stack buffer (snprintf truncates
 * and NUL-terminates) and set it as the calling thread's name via
 * prctl(PR_SET_NAME). The prctl() return value is not checked.
 *
 * NOTE(review): per prctl(2) the kernel keeps at most 16 bytes of the name
 * including the terminating NUL, so longer names are cut short.
 */
#define set_taskname(fmt...) \
do { \
char name[20]; \
\
snprintf(name, 20, fmt); \
prctl(PR_SET_NAME, name); \
} while (0)
static u8 *alloc_data(ssize_t bytes0, int map_flags,
int init_zero, int init_cpu0, int thp, int init_random)
{
cpu_set_t orig_mask;
ssize_t bytes;
u8 *buf;
int ret;
if (!bytes0)
return NULL;
/* Allocate and initialize all memory on CPU#0: */
if (init_cp
numa.rar_NUMA
版权申诉
159 浏览量
2022-09-14
22:20:54
上传
评论
收藏 11KB RAR 举报
局外狗
- 粉丝: 64
- 资源: 1万+
最新资源
- U8Cloud常见开发的示例
- AT32与Sxx32/Gx32替换对照表
- STM32VET6单片机+XC3S250E(FPGA)+AD9708 DDS信号发生器MCU+FPGA源码+PDF硬件设计原理图
- SCIPOptSuite-9.0.0-win64-VS15.exe
- GIS10.4补丁文件
- fdgsfdgfdsgsf
- 基于CodeMirror5实现的mybatis+sql提示扩展功能,主要是用于在线设计开发场景中的动态sql部分.zip
- 计算机毕业设计-aSP.NET某中学学生成绩管理系统的设计(源代码+)-毕设源码实例.zip
- springboot集成mybatis动态sql.zip
- mybatis中的动态sql, 涉及 where trim set if foreach等
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈