#include "gemm.h"
#include "utils.h"
#include "im2col.h"
#include "dark_cuda.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include <stdint.h>
#ifdef _WIN32
#include <intrin.h>
#endif
#if defined(_OPENMP)
#include <omp.h>
#endif
#define TILE_M 4 // 4 ops
#define TILE_N 16 // AVX2 = 2 ops * 8 floats
#define TILE_K 16 // loop
#ifdef __cplusplus
#define PUT_IN_REGISTER
#else
#define PUT_IN_REGISTER register
#endif
void gemm_bin(int M, int N, int K, float ALPHA,
char *A, int lda,
float *B, int ldb,
float *C, int ldc)
{
int i,j,k;
for(i = 0; i < M; ++i){
for(k = 0; k < K; ++k){
char A_PART = A[i*lda+k];
if(A_PART){
for(j = 0; j < N; ++j){
C[i*ldc+j] += B[k*ldb+j];
}
} else {
for(j = 0; j < N; ++j){
C[i*ldc+j] -= B[k*ldb+j];
}
}
}
}
}
float *random_matrix(int rows, int cols)
{
int i;
float* m = (float*)xcalloc(rows * cols, sizeof(float));
for(i = 0; i < rows*cols; ++i){
m[i] = (float)rand()/RAND_MAX;
}
return m;
}
void time_random_matrix(int TA, int TB, int m, int k, int n)
{
float *a;
if(!TA) a = random_matrix(m,k);
else a = random_matrix(k,m);
int lda = (!TA)?k:m;
float *b;
if(!TB) b = random_matrix(k,n);
else b = random_matrix(n,k);
int ldb = (!TB)?n:k;
float *c = random_matrix(m,n);
int i;
clock_t start = clock(), end;
for(i = 0; i<10; ++i){
gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
}
end = clock();
printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);
free(a);
free(b);
free(c);
}
void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
float *A, int lda,
float *B, int ldb,
float BETA,
float *C, int ldc)
{
gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
}
//--------------------------------------------
// XNOR bitwise GEMM for binary neural network
//--------------------------------------------
static inline unsigned char xnor(unsigned char a, unsigned char b) {
//return a == b;
return !(a^b);
}
// INT-32
static inline uint32_t get_bit_int32(uint32_t const*const src, size_t index) {
size_t src_i = index / 32;
int src_shift = index % 32;
unsigned char val = (src[src_i] & (1 << src_shift)) > 0;
return val;
}
static inline uint32_t xnor_int32(uint32_t a, uint32_t b) {
return ~(a^b);
}
static inline uint64_t xnor_int64(uint64_t a, uint64_t b) {
return ~(a^b);
}
static inline uint32_t fill_bit_int32(char src) {
if (src == 0) return 0x00000000;
else return 0xFFFFFFFF;
}
static inline uint64_t fill_bit_int64(char src) {
if (src == 0) return 0x0000000000000000;
else return 0xFFFFFFFFFFFFFFFF;
}
void binary_int32_printf(uint32_t src) {
int i;
for (i = 0; i < 32; ++i) {
if (src & 1) printf("1");
else printf("0");
src = src >> 1;
}
printf("\n");
}
void binary_int64_printf(uint64_t src) {
int i;
for (i = 0; i < 64; ++i) {
if (src & 1) printf("1");
else printf("0");
src = src >> 1;
}
printf("\n");
}
/*
void gemm_nn_custom_bin_mean(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int *count_arr = xcalloc(M*N, sizeof(int));
int i, j, k;
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
for (k = 0; k < K; ++k) { // l.size*l.size*l.c - one filter size [27 - 9216]
char a_bit = get_bit(A, i*lda + k);
for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
char b_bit = get_bit(B, k*ldb + j);
count_arr[i*ldc + j] += xnor(a_bit, b_bit);
}
}
}
for (i = 0; i < M; ++i) {
float mean_val = mean_arr[i];
for (j = 0; j < N; ++j) {
C[i*ldc + j] = (2 * count_arr[i*ldc + j] - K) * mean_val;
}
}
free(count_arr);
}
*/
/*
void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int *count_arr = xcalloc(M*N, sizeof(int));
int i, j, k;
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
for (k = 0; k < K; ++k) { // l.size*l.size*l.c - one filter size [27 - 9216]
char a_bit = get_bit(A, i*lda + k);
char b_bit = get_bit(B, j*ldb + k);
count_arr[i*ldc + j] += xnor(a_bit, b_bit);
}
}
}
for (i = 0; i < M; ++i) {
float mean_val = mean_arr[i];
for (j = 0; j < N; ++j) {
C[i*ldc + j] = (2 * count_arr[i*ldc + j] - K) * mean_val;
}
}
free(count_arr);
}
*/
/*
void gemm_nn_custom_bin_mean(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int *count_arr = xcalloc(M*N, sizeof(int));
int i;
#pragma omp parallel for
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
int j, k, h;
for (k = 0; k < K; ++k) { // l.size*l.size*l.c - one filter size [27 - 9216]
const char a_bit = get_bit(A, i*lda + k);
uint64_t a_bit64 = fill_bit_int64(a_bit);
int k_ldb = k*ldb;
for (j = 0; j < N; j += 64) { // out_h*out_w - one channel output size [169 - 173056]
if ((N - j > 64) && (k_ldb % 8 == 0)) {
uint64_t b_bit64 = *((uint64_t *)(B + (k_ldb + j) / 8));
uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
//printf("\n %d \n",__builtin_popcountll(c_bit64)); // gcc
printf("\n %d \n", __popcnt64(c_bit64)); // msvs
int h;
for (h = 0; h < 64; ++h)
if ((c_bit64 >> h) & 1) count_arr[i*ldc + j + h] += 1;
//binary_int64_printf(a_bit64);
//binary_int64_printf(b_bit64);
//binary_int64_printf(c_bit64);
}
else {
for (; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
char b_bit = get_bit(B, k_ldb + j);
if (xnor(a_bit, b_bit)) count_arr[i*ldc + j] += 1;
}
}
}
}
}
if (mean_arr) {
//int K_2 = K / 2;
for (i = 0; i < M; ++i) {
float mean_val = mean_arr[i];
//float mean_val2 = 2 * mean_val;
for (j = 0; j < N; ++j) {
C[i*ldc + j] = (2 * count_arr[i*ldc + j] - K) * mean_val;
//C[i*ldc + j] = (count_arr[i*ldc + j] - K_2) *mean_val2;
}
}
}
else {
for (i = 0; i < M; ++i) {
for (j = 0; j < N; ++j) {
C[i*ldc + j] = count_arr[i*ldc + j] - K / 2;
}
}
}
free(count_arr);
//getchar();
}
*/
/*
void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
int i;
#pragma omp parallel for
for (i = 0; i < M; ++i) { // l.n - filters [16 - 55 - 1024]
int j, k, h;
float mean_val = mean_arr[i];
for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
int count = 0;
for (k = 0; k < K; k += 64) { // l.size*l.size*l.c
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
基于darknet框架yolov3模型的交通管理系统c和c++源码+注释拉满(带GUI界面、人流量车流量统计).zip 【项目介绍】 功能 完成 人流量统计 车流量统计 未完成 占用公交车道 闯红绿灯 车牌失败 斑马线不礼让行人 功能 完成 人流量统计 车流量统计 未完成 占用公交车道 闯红绿灯 车牌失败 斑马线不礼让行人
资源推荐
资源详情
资源评论













收起资源包目录





































































































共 176 条
- 1
- 2
资源评论


zui初的梦想
- 粉丝: 822
- 资源: 1754
上传资源 快速赚钱
我的内容管理 展开
我的资源 快来上传第一个资源
我的收益
登录查看自己的收益我的积分 登录查看自己的积分
我的C币 登录后查看C币余额
我的收藏
我的下载
下载帮助


安全验证
文档复制为VIP权益,开通VIP直接复制
