#include <cuda_runtime.h>
#include <cusparse.h>
#include <cublas_v2.h>
#define BLOCK_SIZE_XX 512
void adjnull_lk(bool adj,bool add,int size_mod,int size_data,float *mod_d,float *data_d)
{
if(add) return;
if(adj)
{
cudaMemset(mod_d,0,size_mod*sizeof(float));
}
else
{
cudaMemset(data_d,0,size_data*sizeof(float));
}
}
__global__ void MatrixMulKernel(bool adj,float *A_d,float *x_d,float *Y_d,int nx,int ny)
{
int tx = blockIdx.x*blockDim.x+threadIdx.x;
int ty = blockIdx.y*blockDim.y+threadIdx.y;
float Melement=0.0,Nelement=0.0;
float Pvalue = 0;
if(tx<nx&&ty<ny)
{
if(adj)
{
for (int q = 0; q < ny; ++q)
{
Melement = A_d[q * nx + tx];
Nelement = Y_d[q];
Pvalue += Melement * Nelement;
}
x_d[tx] = Pvalue;
}
else
{
for (int q = 0; q < nx; ++q)
{
Melement = A_d[ty * nx + q];
Nelement = x_d[q];
Pvalue += Melement * Nelement;
}
Y_d[ty] = Pvalue;
}
}
}
void sf_cgstep_gpu_lk_ori(
bool first,
int size_mod, int size_data /* model size, data size */,
float* mod_d /* current model [nx] */,
float* g_d /* gradient [nx] */,
float* rr_d /* data residual [ny] */,
float* gg_d /* conjugate gradient [ny] */,
float* Ss_d, /* residual step */
float* S_d, /* model step */
int iter,
float dpg0,
float dpr0)
/*< Step of conjugate-gradient iteration. >*/
{
/* Get handle to the CUBLAS context */
cublasHandle_t cublasHandle = 0;
cublasStatus_t cublasStatus;
cublasStatus=cublasCreate(&cublasHandle);
dim3 dimBlock_for_cg(BLOCK_SIZE_XX,1);
dim3 dimGrid_mod((size_mod+dimBlock_for_cg.x-1)/dimBlock_for_cg.x,1);
dim3 dimGrid_data((size_data+dimBlock_for_cg.x-1)/dimBlock_for_cg.x,1);
float sds, gdg, gds, determ, gdr, sdr, alfa, beta,m,dpg,dpr,test;
m=0.0;
if (iter == 0)
{
cublasStatus = cublasSdot(cublasHandle, size_data, gg_d, 1, gg_d, 1, &test);
if(test==0) printf("cgstep: grad vanishes identically");
cudaMemset(S_d,0,size_mod*sizeof(float));
cudaMemset(Ss_d,0,size_data*sizeof(float));
cublasStatus = cublasSdot(cublasHandle, size_data, gg_d, 1, rr_d, 1, &gdr);
cublasStatus = cublasSdot(cublasHandle, size_data, gg_d, 1, gg_d, 1, &gdg);
alfa=gdr/gdg;
beta=0.0;
dpr = 1.;
dpg = 1.;
}
else
{
cublasStatus = cublasSdot(cublasHandle, size_data, gg_d, 1, gg_d, 1, &gdg);
cublasStatus = cublasSdot(cublasHandle, size_data, Ss_d, 1, Ss_d, 1, &sds);
cublasStatus = cublasSdot(cublasHandle, size_data, gg_d, 1, Ss_d, 1, &gds);
determ=gdg*sds-gds*gds+(0.00001*(gdg*sds)+1.e-15);
cublasStatus = cublasSdot(cublasHandle, size_data, gg_d, 1, rr_d, 1, &gdr);
cublasStatus = cublasSdot(cublasHandle, size_data, Ss_d, 1, rr_d, 1, &sdr);
alfa = ( sds * gdr - gds * sdr ) / determ;
beta = (-gds * gdr + gdg * sdr ) / determ;
cublasStatus = cublasSdot(cublasHandle, size_data, rr_d, 1, rr_d, 1, &dpr);
dpr=dpr/dpr0;
cublasStatus = cublasSdot(cublasHandle, size_mod, g_d, 1, g_d, 1, &dpg);
dpg=dpg/dpg0;
}
cublasStatus = cublasSdot(cublasHandle, size_mod, mod_d, 1, mod_d, 1, &m);
printf ("GPU-iteration %d res %f mod %f grad %f\n",iter+1, dpr, m, dpg);
// s = alpha*g + beta*s;
cublasStatus = cublasSscal(cublasHandle, size_mod, &beta, S_d, 1);
cublasStatus = cublasSaxpy(cublasHandle, size_mod, &alfa, g_d, 1, S_d, 1);
// ss = alpha*gg + beta*ss;
cublasStatus = cublasSscal(cublasHandle, size_data, &beta, Ss_d, 1);
cublasStatus = cublasSaxpy(cublasHandle, size_data, &alfa, gg_d, 1, Ss_d, 1);
float temp=1.0,temp2=-1.0;
cublasStatus = cublasSaxpy(cublasHandle, size_mod, &temp, S_d, 1, mod_d, 1);
cublasStatus = cublasSaxpy(cublasHandle, size_data, &temp2, Ss_d, 1, rr_d, 1);
cublasDestroy(cublasHandle);
}