#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <unistd.h>
#include <string.h>
#include "/usr/local/Ascend/driver/include/dsmi_common_interface.h"
/* Capacity (in entries) of the array that receives a device's error codes. */
#define ERROR_CODE_MAX_NUM 128
/* Size in bytes of the buffer that receives an error description string. */
#define BUFF_SIZE 256
/* File-scope buffer filled by dsmi_get_aicpu_info() inside getHuaWeiAICardInfo(). */
struct dsmi_aicpu_info_stru aicpu_info;
/* File-scope buffer filled by dsmi_get_aicore_info() inside getHuaWeiAICardInfo(). */
struct dsmi_aicore_info_stru pdevice_aicore_info={0};
/*
 * Snapshot of the metrics collected from a Huawei AI card.
 * Percentage fields hold plain integers: a value of 30 means 30%.
 */
struct HuaWeiAICardInfo
{
    /* Number of AI cards currently online. */
    int device_count;

    /* Real-time temperature of the AI card, in degrees Celsius. */
    int temperature;

    /* Memory utilization, in percent. */
    unsigned int utilization_rate;

    /* Current AI Core frequency, in MHz. */
    unsigned int aicore_cur_freq;

    /* Maximum AI Core frequency, in MHz. */
    unsigned int aicore_max_freq;

    /* AI Core utilization, in percent. */
    unsigned int aicore_utilizationrate;

    /* Current AI CPU frequency, in MHz. */
    unsigned int aicpu_cur_freq;

    /* Maximum AI CPU frequency, in MHz. */
    unsigned int aicpu_max_freq;

    /* AI CPU utilization, in percent. */
    unsigned int aicpu_utilizationrate;

    /* Host CPU utilization, in percent. */
    unsigned int hostcpu_utilizationrate;

    /* Real-time bandwidth, in percent. */
    unsigned int bandwidth;
};
#include <QApplication>
int getHuaWeiAICardInfo(HuaWeiAICardInfo &aiinfo)
{
int ret = 0;
unsigned int health = 0;
//检测在线的AI卡数量
int device_count;
ret = dsmi_get_device_count(&device_count);
if(ret != 0) {
printf("Error: Could not run dsmi_get_device_count --- Hua Wei AI card !\n");
return ret;
}
//返回AI卡的数量
if (device_count == 0) {
printf("Error: Could not find any Hua Wei AI card !\n");
return -2;
}
if(device_count!=1)
{
printf("Error: Could not find any Hua Wei AI card !\n");
return -2;
}
printf("Note:Hua Wei AI card ---find Hua Wei AI card : %d !\n",device_count);
aiinfo.device_count=device_count;
//获得每张AI卡的ID
int device_list[64] = {0};
ret = dsmi_list_device(device_list, device_count);
if(ret != 0) {
printf("Error: Could not run dsmi_list_device --- Hua Wei AI card !\n");
return ret;
}
int errorcount = 0;
unsigned char perrorinfo[BUFF_SIZE] = {0};
unsigned int perrorcode [ERROR_CODE_MAX_NUM] = {0};
//对每张AI卡获取要观察的指标
for (int dev_id = 0; dev_id < device_count; dev_id++)
{
//AI卡的健康状态
ret = dsmi_get_device_health(device_list[dev_id], &health);
if (ret != 0) {
printf("Error: Could not run dsmi_get_device_health --- Hua Wei AI card !\n");
return ret;
}
if (health != 0)
{
//如果AI卡不健康,返回不健康码
ret = dsmi_get_device_errorcode(0, &errorcount, perrorcode);
if(ret != 0 || (errorcount == 0)) {
printf("Error: Could not run dsmi_get_device_errorcode --- Hua Wei AI card !\n");
return ret;
}
for (int erri = 0; erri < errorcount; erri++)
{
ret = dsmi_query_errorstring(0, perrorcode[erri], perrorinfo, BUFF_SIZE);
if(ret != 0) {
printf("Error: Hua Wei AI card : errorcode: %d !\n",perrorcode[erri]);
//printf(perrorinfo);
continue;
}
}
return -3;
}
//获取实时温度
int temper = 0;
ret = dsmi_get_device_temperature(device_list[dev_id], &temper);
if(ret != 0) {
printf("Error: Could not run dsmi_get_device_temperature --- Hua Wei AI card !\n");
return ret;
}
printf("Note:Hua Wei AI card ---temperature: %d !\n",temper);
aiinfo.temperature=temper;
//获取内存占用率
unsigned int putilization_rate;
ret = dsmi_get_device_utilization_rate(dev_id, 1, &putilization_rate);
if(ret != 0) {
printf("Error: Could not run dsmi_get_device_utilization_rate1 --- Hua Wei AI card !\n");
return ret;
}
printf("Note:Hua Wei AI card ---memory utilization rate: %d % !\n",putilization_rate);
aiinfo.utilization_rate=putilization_rate;
//AI CORE的当前频率/最大频率
ret = dsmi_get_aicore_info(device_list[dev_id], &pdevice_aicore_info);
if(ret != 0) {
printf("Error: Could not run dsmi_get_aicore_info --- Hua Wei AI card !\n");
return ret;
}
printf("Note:Hua Wei AI card ---AI Core frequency: %d MHz/%d MHz!\n",pdevice_aicore_info.curfreq,pdevice_aicore_info.freq);
aiinfo.aicore_cur_freq=pdevice_aicore_info.curfreq;
aiinfo.aicore_max_freq=pdevice_aicore_info.freq;
//AI CORE的占用率,比如输出是25,那表示占用率为25%
ret = dsmi_get_device_utilization_rate(dev_id, 2, &putilization_rate);
if(ret != 0) {
printf("Error: Could not run dsmi_get_device_utilization_rate2 --- Hua Wei AI card !\n");
return ret;
}
printf("Note:Hua Wei AI card ---AI core utilization rate: %d % !\n",putilization_rate);
aiinfo.aicore_utilizationrate=putilization_rate;
//AI CPU的当前频率/最大频率
ret = dsmi_get_aicpu_info(dev_id, &aicpu_info);
if (ret) {
printf("Error: Could not run dsmi_get_aicpu_info --- Hua Wei AI card !\n");
return ret;
}
printf("Note:Hua Wei AI card ---AI CPU frequency: %d MHz/%d MHz!\n",aicpu_info.curFreq,aicpu_info.maxFreq);
aiinfo.aicpu_cur_freq=aicpu_info.curFreq;
aiinfo.aicpu_max_freq=aicpu_info.maxFreq;
//AI CPU的占用率
ret = dsmi_get_device_utilization_rate(dev_id, 3, &putilization_rate);
if(ret != 0) {
printf("Error: Could not run dsmi_get_device_utilization_rate3 --- Hua Wei AI card !\n");
return ret;
}
printf("Note:Hua Wei AI card ---AI cpu utilization rate: %d % !\n",putilization_rate);
aiinfo.aicpu_utilizationrate=putilization_rate;
//host CPU的占用率
ret = dsmi_get_device_utilization_rate(dev_id, 4, &putilization_rate);
if(ret != 0) {
printf("Error: Could not run dsmi_get_device_utilization_rate4 --- Hua Wei AI card !\n");
return ret;
}
printf("Note:Hua Wei AI card ---host cpu utilization rate: %d % !\n",putilization_rate);
aiinfo.hostcpu_utilizationrate=putilization_rate;
//实时带宽
ret = dsmi_get_device_utilization_rate(dev_id, 5, &putilization_rate);
if(ret != 0) {
printf("Error: Could not run dsmi_get_device_utilization_rate5 --- Hua Wei AI card !\n");
return ret;
}
printf("Note:Hua Wei AI card ---bandwidth: %d % !\n",putilization_rate);
aiinfo.bandwidth=putilization_rate;
}
return 0;
}
/*
 * Program entry point: collects the AI card metrics once and reports the
 * outcome through the process exit status.
 *
 * Returns EXIT_SUCCESS when getHuaWeiAICardInfo() succeeds, EXIT_FAILURE
 * otherwise (the specific error code is written to stderr).
 */
int main(int argc, char *argv[])
{
    /* Command-line arguments are currently not used. */
    (void)argc;
    (void)argv;

    HuaWeiAICardInfo theaiinfo;
    memset(&theaiinfo, 0, sizeof(theaiinfo));

    int flag = getHuaWeiAICardInfo(theaiinfo);
    if (flag != 0) {
        /* Propagate the failure instead of silently exiting with status 0. */
        fprintf(stderr, "Error: getHuaWeiAICardInfo failed with code %d\n", flag);
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}