/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*
\file
\brief Matrix classes with value semantics.
*/
#pragma once
#if !defined(__CUDACC_RTC__)
#include <iosfwd>
#include <cmath>
#endif
#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/coord.h"
#include "cutlass/fast_math.h"
#include "cutlass/layout/matrix.h"
namespace cutlass {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Primary template for a fixed-size matrix of `Element` with `Rows` rows and `Columns` columns.
/// It is only declared here; the partial specializations that follow supply the definitions.
template <typename Element, int Rows, int Columns> struct Matrix;
/////////////////////////////////////////////////////////////////////////////////////////////////
/// 1-by-2 matrix template class definition
template <typename Element_>
struct Matrix<Element_, 1, 2> {
//
// Type definitions
//
/// Element data type (alias of the `Element_` template argument)
using Element = Element_;
/// Number of rows in the matrix
static int const kRows = 1;
/// Number of columns in the matrix
static int const kColumns = 2;
/// Layout of the matrix within the underlying array (row-major)
using Layout = layout::RowMajor;
/// Total number of elements stored in the matrix
static int const kCount = 2;
//
// Data members
//
/// Elements of the matrix, stored contiguously in row-major order
Array<Element, kCount> data;
//
// Methods
//
/// Default constructor: clears the element storage so that the matrix starts out as zero.
CUTLASS_HOST_DEVICE
Matrix() {
  this->data.clear();
}
/// Copy constructor: duplicates the element storage of another 1-by-2 matrix.
CUTLASS_HOST_DEVICE
Matrix(Matrix const &rhs) {
  this->data = rhs.data;
}
/// Constructs a 1-by-2 matrix from its scalar elements, given in row-major order.
///
/// `_0_0` becomes element (0, 0) and `_0_1` becomes element (0, 1).
CUTLASS_HOST_DEVICE
Matrix(
  Element _0_0, Element _0_1
) {
  data[0] = _0_0;
  data[1] = _0_1;
}
/// Builds a matrix in which every element is set to the scalar `s`.
CUTLASS_HOST_DEVICE
static Matrix uniform(Element s) {
  Matrix filled;
  for (int idx = 0; idx < kCount; ++idx) {
    filled.data[idx] = s;
  }
  return filled;
}
/// Builds a matrix in which every element equals `Element(1)`.
CUTLASS_HOST_DEVICE
static Matrix ones() {
  Element one = Element(1);
  return uniform(one);
}
/// Builds a zero matrix by returning a default-constructed (cleared) instance.
CUTLASS_HOST_DEVICE
static Matrix zero() {
  Matrix zeroed;
  return zeroed;
}
/// Returns the 2-by-1 transpose of this matrix.
///
/// The two elements are copied to the same linear offsets in the result.
CUTLASS_HOST_DEVICE
Matrix<Element, 2, 1> transpose() const {
  Matrix<Element, 2, 1> transposed;
  for (int idx = 0; idx < kCount; ++idx) {
    transposed.data[idx] = data[idx];
  }
  return transposed;
}
/// Accesses an element by (row, column) coordinate and returns it by value.
///
/// The storage is row-major, so the linear offset is `i * kColumns + j`, the
/// same indexing used by slice_1x2() and set_slice_1x2(). The only valid row
/// of this 1-by-2 matrix is `i == 0`; this method performs no bounds check.
///
/// \param i  row index
/// \param j  column index
CUTLASS_HOST_DEVICE
Element at(int i, int j) const {
  return data[i * kColumns + j];
}
/// Accesses an element by (row, column) coordinate and returns a mutable reference.
///
/// The storage is row-major, so the linear offset is `i * kColumns + j`, the
/// same indexing used by slice_1x2() and set_slice_1x2(). The only valid row
/// of this 1-by-2 matrix is `i == 0`; this method performs no bounds check.
///
/// \param i  row index
/// \param j  column index
CUTLASS_HOST_DEVICE
Element & at(int i, int j) {
  return data[i * kColumns + j];
}
/// Reads the element addressed by a two-dimensional coordinate.
///
/// `coord[0]` is used as the row index and `coord[1]` as the column index.
CUTLASS_HOST_DEVICE
Element at(Coord<2> const &coord) const {
  int const row_idx = coord[0];
  int const col_idx = coord[1];
  return at(row_idx, col_idx);
}
/// Returns a mutable reference to the element addressed by a two-dimensional coordinate.
///
/// `coord[0]` is used as the row index and `coord[1]` as the column index.
CUTLASS_HOST_DEVICE
Element & at(Coord<2> const &coord) {
  int const row_idx = coord[0];
  int const col_idx = coord[1];
  return at(row_idx, col_idx);
}
/// Returns a mutable reference to the element at linear offset `offset` in the underlying array.
CUTLASS_HOST_DEVICE
Element &at(int offset) {
  return this->data[offset];
}
/// Returns, by value, the element at linear offset `offset` in the underlying array.
CUTLASS_HOST_DEVICE
Element at(int offset) const {
  return this->data[offset];
}
/// Subscript by coordinate: reads the element at row `coord[0]`, column `coord[1]`.
CUTLASS_HOST_DEVICE
Element operator[](Coord<2> const &coord) const {
  // The coordinate overload of at() unpacks (row, column) for us.
  return at(coord);
}
/// Subscript by coordinate: mutable reference to the element at row `coord[0]`, column `coord[1]`.
CUTLASS_HOST_DEVICE
Element & operator[](Coord<2> const &coord) {
  // The coordinate overload of at() unpacks (row, column) for us.
  return at(coord);
}
/// Subscript by linear offset: mutable reference to the element at `offset`.
CUTLASS_HOST_DEVICE
Element & operator[](int offset) {
  return at(offset);
}
/// Subscript by linear offset: returns the element at `offset` by value.
CUTLASS_HOST_DEVICE
Element operator[](int offset) const {
  return at(offset);
}
/// Extracts a 1-by-2 submatrix whose first element is at (row `i`, column `j`).
///
/// Both offsets default to 0, in which case the result is a copy of this matrix.
CUTLASS_HOST_DEVICE
Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
  // Row-major linear offset of the slice's first element.
  int const base = i * 2 + j;
  Matrix<Element, 1, 2> sub;
  sub.data[0] = data[base];
  sub.data[1] = data[base + 1];
  return sub;
}
/// Overwrites the 1-by-2 submatrix starting at (row `i`, column `j`) with the contents of `m`.
///
/// Both offsets default to 0. Returns a reference to this matrix so calls can be chained.
CUTLASS_HOST_DEVICE
Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
  // Row-major linear offset of the destination's first element.
  int const base = i * 2 + j;
  data[base] = m.data[0];
  data[base + 1] = m.data[1];
  return *this;
}
/// Returns row `i` as a 1-by-2 matrix (the slice that starts at column 0 of that row).
CUTLASS_HOST_DEVICE
Matrix<Element, 1, 2> row(int i) const {
  int const first_column = 0;
  return slice_1x2(i, first_column);
}
/// Overwrites row `i` (default 0) with the 1-by-2 matrix `v` and returns a reference to this matrix.
CUTLASS_HOST_DEVICE
Matrix &set_row(Matrix<Element, 1, 2> const &v, int i = 0) {
  int const first_column = 0;
  return set_slice_1x2(v, i, first_column);
}
/// Forms a 1-by-2 matrix by placing two scalar elements side by side: `lhs` first, then `rhs`.
CUTLASS_HOST_DEVICE
static Matrix hcat(Element lhs, Element rhs) {
  Matrix joined(lhs, rhs);
  return joined;
}
/// Horizontally concatenates this matrix with a single Element to form a 1-by-3 matrix.
///
/// Delegates to the static hcat() of the 1-by-3 matrix type.
CUTLASS_HOST_DEVICE
Matrix<Element, 1, 3> hcat(Element rhs) const {
  using Result = Matrix<Element, 1, 3>;
  return Result::hcat(*this, rhs);
}
/// Horizontally concatenates this matrix with another 1-by-2 matrix to form a 1-by-4 matrix.
///
/// Delegates to the static hcat() of the 1-by-4 matrix type.
CUTLASS_HOST_DEVICE
Matrix<Element, 1, 4> hcat(Matrix<Element, 1, 2> const & rhs) const {
  using Result = Matrix<Element, 1, 4>;
  return Result::hcat(*this, rhs);
}
/// Vertically concatenates this matrix with another 1-by-2 matrix to form a 2-by-2 matrix.
///
/// Delegates to the static vcat() of the 2-by-2 matrix type.
CUTLASS_HOST_DEVICE
Matrix<Element, 2, 2> vcat(Matrix<Element, 1, 2> const & rhs) const {
  using Result = Matrix<Element, 2, 2>;
  return Result::vcat(*this, rhs);
}
/// Vertically concatenates this matrix with a 2-by-2 matrix to form a 3-by-2 matrix.
///
/// Delegates to the static vcat() of the 3-by-2 matrix type.
CUTLASS_HOST_DEVICE
Matrix<Element, 3, 2> vcat(Matrix<Element, 2, 2> const & rhs) const {
  using Result = Matrix<Element, 3, 2>;
  return Result::vcat(*this, rhs);
}
/// Vertically concatenates this matrix with a 3-by-2 matrix to form a 4-by-2 matrix.
///
/// Delegates to the static vcat() of the 4-by-2 matrix type.
CUTLASS_HOST_DEVICE
Matrix<Element, 4, 2> vcat(Matrix<Element, 3, 2> const & rhs) const {
  using Result = Matrix<Element, 4, 2>;
  return Result::vcat(*this, rhs);
}
/// Elementwise add operator (1-by-2)
CUTLASS_HOST_DEVICE
Matrix add(Matrix const &rhs) const {
Matrix result;
result.data[0] = data[0] + rhs.data[0];
resu
没有合适的资源?快使用搜索试试~ 我知道了~
Linuxtiny-cuda-nn直接安装
共2000个文件
html:1278个
h:545个
js:76个
需积分: 5 29 下载量 116 浏览量
2023-11-19
20:29:54
上传
评论 2
收藏 160.27MB ZIP 举报
温馨提示
根据配套的文章命令,就可以轻松的在自己电脑上(服务器linux上)安装成功tiny-cuda-nn啦!!git不下来?没关系,我下载好啦!安装总是报错?没关系,我下载的这个是全套完整的!!跟着安装命令步骤来,准没错!芜湖~~
资源推荐
资源详情
资源评论
收起资源包目录
Linuxtiny-cuda-nn直接安装 (2000个子文件)
problem_space.cpp 38KB
cudnn_helpers.cpp 17KB
performance_report.cpp 14KB
enumerated_types.cpp 8KB
visualize_layout.cpp 6KB
gpu_timer.cpp 4KB
manifest.cpp 4KB
main.cpp 2KB
stbi_wrapper.cpp 1KB
doxygen.css 27KB
matrix.h 369KB
stb_image.h 281KB
mma_tensor_op_tile_iterator.h 137KB
default_mma_core_sm80.h 103KB
mma_tensor_op_tile_iterator_sm70.h 100KB
mma_complex_tensor_op_tile_iterator_sm80.h 79KB
mma_tensor_op_tile_iterator_sm80.h 76KB
stb_image_write.h 71KB
predicated_tile_access_iterator.h 70KB
default_multistage_mma_complex_core_sm80.h 64KB
common_device.h 63KB
predicated_tile_iterator.h 62KB
functional.h 62KB
mma_simt_tile_iterator.h 60KB
default_mma_core_simt.h 58KB
default_conv2d_fprop.h 57KB
mma_sm80.h 55KB
default_conv2d_dgrad.h 54KB
convolution.h 48KB
regular_tile_access_iterator_tensor_op_sm80.h 48KB
constants.h 48KB
tensor_fill.h 47KB
numeric_conversion.h 46KB
regular_tile_iterator_tensor_op_sm70.h 44KB
grid.h 44KB
vec.h 43KB
mma_sparse_sm80.h 43KB
gemm_operation.h 43KB
default_mma_core_sm75.h 43KB
tensor_fill.h 42KB
library.h 39KB
predicated_tile_iterator.h 37KB
regular_tile_iterator_tensor_op.h 36KB
gemm_with_softmax.h 35KB
matrix.h 35KB
default_gemm.h 34KB
default_mma.h 34KB
shampoo.h 34KB
b2b_mma_multistage.h 34KB
b2b_mma_multistage_smem_accumulator.h 33KB
tensor_op_multiplicand_sm75.h 33KB
pitch_linear_thread_map.h 33KB
default_mma_core_sparse_sm80.h 32KB
b2b_implicit_gemm_multistage.h 32KB
predicated_tile_access_iterator_triangular_matrix.h 32KB
b2b_implicit_gemm_multistage_smem_accumulator.h 32KB
mma_sm60.h 30KB
mma_sm75.h 30KB
conv2d_params.h 30KB
tensor_op_multiplicand_sm70.h 30KB
implicit_gemm_fprop_fusion_multistage.h 30KB
tensor_op_multiplicand_sm80.h 30KB
epilogue_with_reduction.h 29KB
default_conv2d_wgrad.h 29KB
epilogue_with_broadcast.h 29KB
default_b2b_conv2d_fprop_smem_accumulator_sm75.h 29KB
convolution.h 29KB
predicated_tile_access_iterator_2dthreadtile.h 28KB
b2b_interleaved_conv2d_run.h 28KB
predicated_tile_iterator_triangular_matrix.h 28KB
default_b2b_conv2d_fprop_smem_accumulator_sm80.h 28KB
mma_complex_tensor_op.h 28KB
problem_space.h 28KB
regular_tile_access_iterator_tensor_op.h 28KB
mma_blas3_multistage.h 28KB
default_b2b_mma_smem_accumulator.h 27KB
predicated_tile_iterator_2dthreadtile.h 27KB
mma_tensor_op_tile_iterator_wmma.h 27KB
default_b2b_mma.h 27KB
default_b2b_conv2d_fprop_sm80.h 27KB
default_b2b_conv2d_fprop_sm75.h 27KB
trmm.h 27KB
implicit_gemm_wgrad_fusion_multistage.h 26KB
b2b_conv2d_run.h 26KB
conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h 26KB
platform.h 26KB
mma_sparse_multistage.h 26KB
default_multistage_trmm_complex.h 25KB
gemm_with_fused_epilogue.h 25KB
gemm.h 25KB
b2b_interleaved_gemm_run.h 25KB
fast_math.h 25KB
rank_2k_universal.h 24KB
gemm_with_k_reduction.h 24KB
gpu_memory.h 24KB
half.h 24KB
symm_universal.h 24KB
gemm_universal.h 24KB
gemm_planar_complex.h 24KB
linear_combination_clamp.h 24KB
共 2000 条
- 1
- 2
- 3
- 4
- 5
- 6
- 20
资源评论
小秋今天也要加油吖
- 粉丝: 5
- 资源: 2
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
- (源码)基于SimPy和贝叶斯优化的流程仿真系统.zip
- (源码)基于Java Web的个人信息管理系统.zip
- (源码)基于C++和OTL4的PostgreSQL数据库连接系统.zip
- (源码)基于ESP32和AWS IoT Core的室内温湿度监测系统.zip
- (源码)基于Arduino的I2C协议交通灯模拟系统.zip
- coco.names 文件
- (源码)基于Spring Boot和Vue的房屋租赁管理系统.zip
- (源码)基于Android的饭店点菜系统.zip
- (源码)基于Android平台的权限管理系统.zip
- (源码)基于CC++和wxWidgets框架的LEGO模型火车控制系统.zip
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功