#include "gemm.h"
#include "utils.h"
#include "im2col.h"
#include "dark_cuda.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include <stdint.h>
#ifdef _WIN32
#include <intrin.h>
#endif
#if defined(_OPENMP)
#include <omp.h>
#endif
// Tile dimensions (M x N x K). Their consumers are not visible in this part of
// the file -- presumably a blocked/tiled GEMM kernel; confirm against the users.
#define TILE_M 4 // 4 ops
#define TILE_N 16 // AVX2 = 2 ops * 8 floats
#define TILE_K 16 // loop
// PUT_IN_REGISTER expands to the `register` storage-class specifier when the
// file is compiled as C and to nothing when it is compiled as C++.
#ifdef __cplusplus
#define PUT_IN_REGISTER
#else
#define PUT_IN_REGISTER register
#endif
/*
 * Sign-selected accumulation of rows of B into rows of C.
 *
 * For every (i, k) with 0 <= i < M and 0 <= k < K:
 *   - if A[i*lda + k] is non-zero, row k of B (N elements) is added to row i of C;
 *   - otherwise, row k of B is subtracted from row i of C.
 *
 * C is updated in place and is never cleared here, so the caller's initial
 * contents of C are part of the result. ALPHA is accepted but not used.
 *
 * lda / ldb / ldc are the row strides (in elements) of A / B / C.
 */
void gemm_bin(int M, int N, int K, float ALPHA,
        char *A, int lda,
        float *B, int ldb,
        float *C, int ldc)
{
    int row, inner, col;

    for (row = 0; row < M; ++row) {
        const int a_base = row * lda;
        const int c_base = row * ldc;

        for (inner = 0; inner < K; ++inner) {
            const int b_base = inner * ldb;
            const char selector = A[a_base + inner];

            if (selector != 0) {
                /* non-zero entry: accumulate +B[inner, :] */
                for (col = 0; col < N; ++col) {
                    C[c_base + col] += B[b_base + col];
                }
            } else {
                /* zero entry: accumulate -B[inner, :] */
                for (col = 0; col < N; ++col) {
                    C[c_base + col] -= B[b_base + col];
                }
            }
        }
    }
}
/*
 * Allocate a rows*cols float buffer with xcalloc and fill every element with
 * (float)rand()/RAND_MAX. The caller owns the returned buffer (callers in this
 * file release it with free()).
 */
float *random_matrix(int rows, int cols)
{
    const int count = rows * cols;
    float *values = (float*)xcalloc(count, sizeof(float));
    int idx = 0;

    while (idx < count) {
        values[idx] = (float)rand()/RAND_MAX;
        ++idx;
    }
    return values;
}
/*
 * Benchmark helper: builds random operands and times 10 back-to-back calls of
 * gemm_cpu, then prints the total elapsed CPU time.
 *
 * TA / TB select the stored layout of the operands:
 *   - A is generated as m x k when TA == 0, otherwise as k x m;
 *   - B is generated as k x n when TB == 0, otherwise as n x k.
 * The leading dimensions passed to gemm_cpu follow the generated layout, and C
 * is an m x n random matrix with leading dimension n. ALPHA and BETA are both 1.
 *
 * The printed figure is the total for all 10 calls, measured with clock(), and
 * is reported in milliseconds to match the "ms" label in the message. (The
 * previous code printed seconds under that label.)
 */
void time_random_matrix(int TA, int TB, int m, int k, int n)
{
    /* Generation order a, b, c is kept so the rand() sequence is unchanged. */
    float *a = (!TA) ? random_matrix(m,k) : random_matrix(k,m);
    int lda = (!TA) ? k : m;
    float *b = (!TB) ? random_matrix(k,n) : random_matrix(n,k);
    int ldb = (!TB) ? n : k;
    float *c = random_matrix(m,n);
    int i;
    double elapsed_ms;

    clock_t start = clock(), end;
    for (i = 0; i < 10; ++i) {
        gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
    }
    end = clock();

    /* clock ticks -> seconds -> milliseconds */
    elapsed_ms = 1000.0 * (double)(end - start) / CLOCKS_PER_SEC;
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, elapsed_ms);

    free(a);
    free(b);
    free(c);
}
/*
 * Public GEMM entry point. Forwards every argument, unchanged and in the same
 * order, to gemm_cpu; it performs no work of its own.
 */
void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    gemm_cpu(TA, TB,
             M, N, K,
             ALPHA,
             A, lda,
             B, ldb,
             BETA,
             C, ldc);
}
//--------------------------------------------
// XNOR bitwise GEMM for binary neural network
//--------------------------------------------
/* Logical XNOR of two bytes: returns 1 when a and b are equal, 0 otherwise. */
static inline unsigned char xnor(unsigned char a, unsigned char b) {
    /* a ^ b is zero exactly when the two values match */
    return (unsigned char)(a == b);
}
// INT-32
/*
 * Read one bit from a bit array packed into 32-bit words.
 *
 * src   - array of 32-bit words; bit `index` lives in word index/32 at bit
 *         position index%32 (position 0 is the least significant bit).
 * index - absolute bit index into the packed array.
 *
 * Returns 1 if the bit is set, 0 otherwise.
 *
 * The word is shifted right as an unsigned value and masked with 1u. The
 * earlier form built the mask with `1 << shift`, which shifts a signed int
 * into its sign bit when shift == 31 (undefined behaviour in C).
 */
static inline uint32_t get_bit_int32(uint32_t const*const src, size_t index) {
    const size_t word = index / 32;
    const unsigned int shift = (unsigned int)(index % 32);
    return (src[word] >> shift) & 1u;
}
/* Bitwise XNOR of two 32-bit words: result bit is 1 where a and b agree. */
static inline uint32_t xnor_int32(uint32_t a, uint32_t b) {
    const uint32_t differing = a ^ b;   /* 1 where the inputs differ */
    return ~differing;
}
/* Bitwise XNOR of two 64-bit words: result bit is 1 where a and b agree. */
static inline uint64_t xnor_int64(uint64_t a, uint64_t b) {
    const uint64_t differing = a ^ b;   /* 1 where the inputs differ */
    return ~differing;
}
/*
 * Broadcast a truth value to a full 32-bit mask:
 * returns 0 when src is zero, and all 32 bits set for any non-zero src.
 */
static inline uint32_t fill_bit_int32(char src) {
    const uint32_t all_ones = ~(uint32_t)0;
    return (src != 0) ? all_ones : (uint32_t)0;
}
/*
 * Broadcast a truth value to a full 64-bit mask:
 * returns 0 when src is zero, and all 64 bits set for any non-zero src.
 */
static inline uint64_t fill_bit_int64(char src) {
    const uint64_t all_ones = ~(uint64_t)0;
    return (src != 0) ? all_ones : (uint64_t)0;
}
/*
 * Debug helper: print the 32 bits of src to stdout as '0'/'1' characters,
 * least significant bit first, followed by a newline.
 */
void binary_int32_printf(uint32_t src) {
    int bit;
    for (bit = 0; bit < 32; ++bit) {
        const uint32_t is_set = (src >> bit) & 1u;
        if (is_set) {
            printf("1");
        } else {
            printf("0");
        }
    }
    printf("\n");
}
/*
 * Debug helper: print the 64 bits of src to stdout as '0'/'1' characters,
 * least significant bit first, followed by a newline.
 */
void binary_int64_printf(uint64_t src) {
    int bit;
    for (bit = 0; bit < 64; ++bit) {
        const uint64_t is_set = (src >> bit) & 1u;
        if (is_set) {
            printf("1");
        } else {
            printf("0");
        }
    }
    printf("\n");
}
/*
void gemm_nn_custom_bin_mean(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int *count_arr = xcalloc(M*N, sizeof(int));
int i, j, k;
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
for (k = 0; k < K; ++k) { // l.size*l.size*l.c - one filter size [27 - 9216]
char a_bit = get_bit(A, i*lda + k);
for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
char b_bit = get_bit(B, k*ldb + j);
count_arr[i*ldc + j] += xnor(a_bit, b_bit);
}
}
}
for (i = 0; i < M; ++i) {
float mean_val = mean_arr[i];
for (j = 0; j < N; ++j) {
C[i*ldc + j] = (2 * count_arr[i*ldc + j] - K) * mean_val;
}
}
free(count_arr);
}
*/
/*
void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int *count_arr = xcalloc(M*N, sizeof(int));
int i, j, k;
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
for (k = 0; k < K; ++k) { // l.size*l.size*l.c - one filter size [27 - 9216]
char a_bit = get_bit(A, i*lda + k);
char b_bit = get_bit(B, j*ldb + k);
count_arr[i*ldc + j] += xnor(a_bit, b_bit);
}
}
}
for (i = 0; i < M; ++i) {
float mean_val = mean_arr[i];
for (j = 0; j < N; ++j) {
C[i*ldc + j] = (2 * count_arr[i*ldc + j] - K) * mean_val;
}
}
free(count_arr);
}
*/
/*
void gemm_nn_custom_bin_mean(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int *count_arr = xcalloc(M*N, sizeof(int));
int i;
#pragma omp parallel for
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
int j, k, h;
for (k = 0; k < K; ++k) { // l.size*l.size*l.c - one filter size [27 - 9216]
const char a_bit = get_bit(A, i*lda + k);
uint64_t a_bit64 = fill_bit_int64(a_bit);
int k_ldb = k*ldb;
for (j = 0; j < N; j += 64) { // out_h*out_w - one channel output size [169 - 173056]
if ((N - j > 64) && (k_ldb % 8 == 0)) {
uint64_t b_bit64 = *((uint64_t *)(B + (k_ldb + j) / 8));
uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
//printf("\n %d \n",__builtin_popcountll(c_bit64)); // gcc
printf("\n %d \n", __popcnt64(c_bit64)); // msvs
int h;
for (h = 0; h < 64; ++h)
if ((c_bit64 >> h) & 1) count_arr[i*ldc + j + h] += 1;
//binary_int64_printf(a_bit64);
//binary_int64_printf(b_bit64);
//binary_int64_printf(c_bit64);
}
else {
for (; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
char b_bit = get_bit(B, k_ldb + j);
if (xnor(a_bit, b_bit)) count_arr[i*ldc + j] += 1;
}
}
}
}
}
if (mean_arr) {
//int K_2 = K / 2;
for (i = 0; i < M; ++i) {
float mean_val = mean_arr[i];
//float mean_val2 = 2 * mean_val;
for (j = 0; j < N; ++j) {
C[i*ldc + j] = (2 * count_arr[i*ldc + j] - K) * mean_val;
//C[i*ldc + j] = (count_arr[i*ldc + j] - K_2) *mean_val2;
}
}
}
else {
for (i = 0; i < M; ++i) {
for (j = 0; j < N; ++j) {
C[i*ldc + j] = count_arr[i*ldc + j] - K / 2;
}
}
}
free(count_arr);
//getchar();
}
*/
/*
void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int i;
#pragma omp parallel for
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
int j, k, h;
float mean_val = mean_arr[i];
for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
int count = 0;
for (k = 0; k < K; k += 64) { // l.size*l.size*l.c
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
基于darknet框架yolov3模型的交通管理系统c和c++源码+注释拉满(带GUI界面、人流量车流量统计).zip 【项目介绍】 功能 完成 人流量统计 车流量统计 未完成 占用公交车道 闯红绿灯 车牌失败 斑马线不礼让行人 功能 完成 人流量统计 车流量统计 未完成 占用公交车道 闯红绿灯 车牌失败 斑马线不礼让行人
资源推荐
资源详情
资源评论
收起资源包目录
基于darknet框架yolov3模型的交通管理系统c和c++源码+注释拉满(带GUI界面、人流量车流量统计).zip (176个子文件)
gemm.c 102KB
parser.c 76KB
data.c 64KB
detector.c 64KB
convolutional_layer.c 59KB
conv_lstm_layer.c 47KB
image.c 44KB
classifier.c 43KB
network.c 41KB
gaussian_yolo_layer.c 34KB
yolo_layer.c 31KB
box.c 29KB
go.c 26KB
lstm_layer.c 25KB
region_layer.c 22KB
utils.c 22KB
darknet.c 19KB
dark_cuda.c 17KB
blas.c 16KB
batchnorm_layer.c 15KB
gru_layer.c 15KB
coco.c 14KB
connected_layer.c 14KB
maxpool_layer.c 14KB
rnn.c 14KB
crnn_layer.c 14KB
getopt.c 13KB
shortcut_layer.c 12KB
yolo.c 12KB
layer.c 12KB
demo.c 12KB
detection_layer.c 12KB
captcha.c 11KB
compare.c 11KB
activations.c 11KB
rnn_layer.c 10KB
nightmare.c 9KB
local_layer.c 9KB
cifar.c 9KB
matrix.c 8KB
rnn_vid.c 7KB
deconvolutional_layer.c 6KB
normalization_layer.c 6KB
route_layer.c 5KB
scale_channels_layer.c 5KB
voxel.c 5KB
tag.c 4KB
writing.c 4KB
cost_layer.c 4KB
softmax_layer.c 4KB
im2col.c 4KB
super.c 4KB
tree.c 4KB
dice.c 4KB
col2im.c 4KB
sam_layer.c 3KB
option_list.c 3KB
upsample_layer.c 3KB
reorg_layer.c 3KB
reorg_old_layer.c 3KB
dropout_layer.c 3KB
crop_layer.c 3KB
swag.c 2KB
cpu_gemm.c 2KB
avgpool_layer.c 2KB
list.c 2KB
activation_layer.c 2KB
art.c 2KB
gettimeofday.c 1KB
imgui.cpp 491KB
imgui_widgets.cpp 329KB
imgui_demo.cpp 236KB
imgui_draw.cpp 163KB
image_opencv.cpp 44KB
common.cpp 34KB
yolo_console_dll.cpp 29KB
gui.cpp 24KB
main.cpp 23KB
http_stream.cpp 22KB
video.cpp 16KB
imgui_impl_win32.cpp 15KB
imgui_impl_dx9.cpp 13KB
yolo_v2_class.cpp 12KB
recognition.cpp 4KB
car_id.cpp 4KB
detect.cpp 4KB
help.cpp 2KB
main.cpp 464B
struct.cpp 21B
im2col_kernels.cu 84KB
blas_kernels.cu 82KB
convolutional_kernels.cu 57KB
network_kernels.cu 22KB
activation_kernels.cu 21KB
maxpool_layer_kernels.cu 11KB
dropout_layer_kernels.cu 9KB
crop_layer_kernels.cu 7KB
col2im_kernels.cu 6KB
deconvolutional_kernels.cu 4KB
avgpool_layer_kernels.cu 2KB
共 176 条
- 1
- 2
资源评论
z同学的编程之路
- 粉丝: 2124
- 资源: 2131
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功