/**
* Copyright (C) 2019-2021 Xilinx, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may
* not use this file except in compliance with the License. A copy of the
* License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
/*******************************************************************************
Description:
This example uses the load/compute/store coding style which is generally
the most efficient for implementing kernels using HLS. The load and store
functions are responsible for moving data in and out of the kernel as
efficiently as possible. The core functionality is decomposed across one
or more compute functions. Whenever possible, the compute function should
pass data through HLS streams and should contain a single set of nested loops.
HLS stream objects are used to pass data between producer and consumer
functions. Stream read and write operations have a blocking behavior which
allows consumers and producers to synchronize with each other automatically.
The dataflow pragma instructs the compiler to enable task-level pipelining.
This is required for the load/compute/store functions to execute in a parallel
and pipelined manner.
The kernel operates on vectors of NUM_WORDS integers modeled using the hls::vector
data type. This datatype provides intuitive support for parallelism and
fits well the vector-add computation. The vector length is set to NUM_WORDS
since NUM_WORDS integers amount to a total of 64 bytes, which is the maximum size of
a kernel port. It is a good practice to match the compute bandwidth to the I/O
bandwidth. Here the kernel loads, computes and stores NUM_WORDS integer values per
clock cycle and is implemented as below:
                                       _____________
                                      |             |<----- Input Vector 1 from Global Memory
                                      |  load_input |       __
                                      |_____________|----->|  |
                                       _____________       |  | in1_stream
Input Vector 2 from Global Memory --->|             |      |__|
                               __     |  load_input |        |
                              |  |<---|_____________|        |
                   in2_stream |  |     _____________         |
                              |__|--->|             |<--------
                                      | compute_add |      __
                                      |_____________|---->|  |
                                       ______________     |  | out_stream
                                      |              |<---|__|
                                      | store_result |
                                      |______________|-----> Output result to Global Memory
*******************************************************************************/
// Includes
#include <hls_vector.h>
#include <hls_stream.h>
#include "assert.h"
// Width, in bits, of one vector word moved per memory access (64 bytes).
#define MEMORY_DWIDTH 512
// Size, in bytes, of one vector element (uint32_t).
#define SIZEOF_WORD 4
// Number of 32-bit elements packed into one MEMORY_DWIDTH-bit vector (512 / 32 = 16).
#define NUM_WORDS ((MEMORY_DWIDTH) / (8 * SIZEOF_WORD))
// Representative problem size, in 32-bit elements.
// NOTE(review): presumably matches the element count the host passes as `size` -- confirm.
#define DATA_SIZE 4096
// TRIPCOUNT identifier.
// Every kernel loop processes one NUM_WORDS-wide vector per iteration and therefore
// runs `size / NUM_WORDS` times, so the representative trip count is expressed in
// vectors (DATA_SIZE / NUM_WORDS), not in elements. Using DATA_SIZE directly would
// overstate the loop latency in the HLS report by a factor of NUM_WORDS.
// LOOP_TRIPCOUNT is used for reporting only; it does not change the generated hardware.
const int c_size = DATA_SIZE / NUM_WORDS;
/**
 * Load stage: copies `vSize` consecutive vectors from `in` into `inStream`.
 *
 * @param in        Source buffer; in[0] .. in[vSize - 1] are read in ascending order.
 * @param inStream  Stream that receives exactly one vector per loop iteration.
 * @param vSize     Number of vectors to transfer; the loop body never runs when vSize <= 0.
 */
static void load_input(hls::vector<uint32_t, NUM_WORDS>* in,
                       hls::stream<hls::vector<uint32_t, NUM_WORDS> >& inStream,
                       int vSize) {
mem_rd:
    for (int word = 0; word < vSize; ++word) {
#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
        // Push the next vector read from the buffer into the stream.
        inStream.write(in[word]);
    }
}
/**
 * Compute stage: reads one vector from each input stream, adds them and
 * writes the sum to the output stream, `vSize` times.
 *
 * @param in1_stream  First operand stream; one vector is consumed per iteration.
 * @param in2_stream  Second operand stream; one vector is consumed per iteration.
 * @param out_stream  Result stream; one vector is produced per iteration.
 * @param vSize       Number of vectors to process; the loop body never runs when vSize <= 0.
 */
static void compute_add(hls::stream<hls::vector<uint32_t, NUM_WORDS> >& in1_stream,
                        hls::stream<hls::vector<uint32_t, NUM_WORDS> >& in2_stream,
                        hls::stream<hls::vector<uint32_t, NUM_WORDS> >& out_stream,
                        int vSize) {
    using vec_t = hls::vector<uint32_t, NUM_WORDS>;
execute:
    for (int step = 0; step < vSize; ++step) {
#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
        const vec_t lhs = in1_stream.read();
        const vec_t rhs = in2_stream.read();
        // operator+ on hls::vector is element-wise: NUM_WORDS additions per iteration.
        out_stream.write(lhs + rhs);
    }
}
/**
 * Store stage: drains `vSize` vectors from `out_stream` into `out`.
 *
 * @param out         Destination buffer; out[0] .. out[vSize - 1] are written in ascending order.
 * @param out_stream  Stream supplying one result vector per loop iteration.
 * @param vSize       Number of vectors to transfer; the loop body never runs when vSize <= 0.
 */
static void store_result(hls::vector<uint32_t, NUM_WORDS>* out,
                         hls::stream<hls::vector<uint32_t, NUM_WORDS> >& out_stream,
                         int vSize) {
mem_wr:
    for (int slot = 0; slot < vSize; ++slot) {
#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
        // Take the next result off the stream, then place it in the buffer.
        const hls::vector<uint32_t, NUM_WORDS> result = out_stream.read();
        out[slot] = result;
    }
}
// C linkage keeps the kernel's symbol name unmangled ("vadd").
extern "C" {
/*
Vector Addition Kernel
Computes out = in1 + in2 element-wise. All three buffers are accessed as arrays of
hls::vector<uint32_t, NUM_WORDS>, so every access moves NUM_WORDS elements at once.
Arguments:
in1 (input) --> Input vector 1
in2 (input) --> Input vector 2
out (output) --> Output vector
size (input) --> Number of 32-bit elements in each vector; expected to be a
multiple of NUM_WORDS (checked by the assert in the body)
*/
void vadd(hls::vector<uint32_t, NUM_WORDS>* in1,
hls::vector<uint32_t, NUM_WORDS>* in2,
hls::vector<uint32_t, NUM_WORDS>* out,
int size) {
// Memory-mapped AXI master interfaces: in1 and out are placed on bundle gmem0,
// in2 on bundle gmem1.
// NOTE(review): in1 and out sharing gmem0 is presumably intentional -- confirm.
#pragma HLS INTERFACE m_axi port = in1 bundle = gmem0
#pragma HLS INTERFACE m_axi port = in2 bundle = gmem1
#pragma HLS INTERFACE m_axi port = out bundle = gmem0
// Streams that connect the load, compute and store stages.
static hls::stream<hls::vector<uint32_t, NUM_WORDS> > in1_stream("input_stream_1");
static hls::stream<hls::vector<uint32_t, NUM_WORDS> > in2_stream("input_stream_2");
static hls::stream<hls::vector<uint32_t, NUM_WORDS> > out_stream("output_stream");
// Since NUM_WORDS values are processed
// in parallel per loop iteration, the for loop only needs to iterate 'size / NUM_WORDS' times.
// A size that is not a multiple of NUM_WORDS would leave a partial trailing vector
// that the integer division below drops, hence the assert.
assert(size % NUM_WORDS == 0);
int vSize = size / NUM_WORDS;
// Task-level pipelining: lets the load, compute and store calls below execute
// in a parallel and pipelined manner, synchronized through the streams.
#pragma HLS dataflow
load_input(in1, in1_stream, vSize);
load_input(in2, in2_stream, vSize);
compute_add(in1_stream, in2_stream, out_stream, vSize);
store_result(out, out_stream, vSize);
}
}
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
基于Xilinx FPGA软件开发环境。 使用xilinx官方vitis example创建的向量加法demo，包括Host、Kernel、Link三部分的工程。适合Alveo加速卡的初学者和小白学习项目结构和加速原理。 Host部分使用OpenCL，Kernel使用 HLS。 目标平台：Xilinx Alveo U50
资源推荐
资源详情
资源评论
收起资源包目录
Demo1_system.ide.zip (28个子文件)
Demo1
.cproject 18KB
.settings
language.settings.xml 4KB
src
host.cpp 6KB
README.rst 3KB
qor.json 915B
Demo1.prj 1KB
libs
common
includes
xcl2
xcl2.cpp 5KB
xcl2.hpp 4KB
xrt.ini 65B
details.rst 2KB
.project 891B
.gitignore 41B
Demo1_system_hw_link
.cproject 9KB
Demo1_system_hw_link.prj 3KB
.project 866B
Demo1_kernels
.cproject 10KB
src
vadd.cpp 6KB
assert.h 5KB
libs
common
includes
xcl2
xcl2.hpp 4KB
Demo1_kernels.prj 3KB
.project 826B
Demo1_system
.cproject 9KB
.project 984B
.gitignore 41B
Demo1_system.sprj 3KB
_ide
launch
SystemDebugger_Demo1_system.launch 6KB
Demo1_system-Default.launch 9KB
sdx_export_metadata
export.json 1KB
共 28 条
- 1
资源评论
小强不吃菜
- 粉丝: 10
- 资源: 17
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功