#include <torch/all.h>
#include <torch/python.h>
#include <c10/cuda/CUDAGuard.h>
// adapted from https://github.com/PanQiWei/AutoGPTQ/blob/main/autogptq_extension/cuda_256/autogptq_cuda_256.cpp
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant8matmul_cuda(torch::Tensor vec, torch::Tensor mat,
                          torch::Tensor mul, torch::Tensor scales,
                          torch::Tensor zeros, torch::Tensor g_idx);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// (no-op when `vec` has no device) and dispatches to the CUDA kernel.
// The g_idx argument presumably carries act-order group indices — confirm
// against the kernel source.
void vecquant8matmul(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
                     torch::Tensor scales, torch::Tensor zeros,
                     torch::Tensor g_idx) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant8matmul_batched_cuda(torch::Tensor vec, torch::Tensor mat,
                                  torch::Tensor mul, torch::Tensor scales,
                                  torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the batched 8-bit CUDA kernel.
void vecquant8matmul_batched(torch::Tensor vec, torch::Tensor mat,
                             torch::Tensor mul, torch::Tensor scales,
                             torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_batched_cuda(vec, mat, mul, scales, zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant8matmul_batched_column_compression_cuda(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the column-compressed batched 8-bit CUDA kernel.
void vecquant8matmul_batched_column_compression(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant4matmul_batched_cuda(torch::Tensor vec, torch::Tensor mat,
                                  torch::Tensor mul, torch::Tensor scales,
                                  torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the batched 4-bit CUDA kernel.
void vecquant4matmul_batched(torch::Tensor vec, torch::Tensor mat,
                             torch::Tensor mul, torch::Tensor scales,
                             torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant4matmul_batched_cuda(vec, mat, mul, scales, zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant4matmul_batched_column_compression_cuda(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the column-compressed batched 4-bit CUDA kernel.
void vecquant4matmul_batched_column_compression(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant4matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant8matmul_batched_old_cuda(torch::Tensor vec, torch::Tensor mat,
                                      torch::Tensor mul, torch::Tensor scales,
                                      torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the legacy ("old"-format) batched 8-bit CUDA kernel.
void vecquant8matmul_batched_old(torch::Tensor vec, torch::Tensor mat,
                                 torch::Tensor mul, torch::Tensor scales,
                                 torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_batched_old_cuda(vec, mat, mul, scales, zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant4matmul_batched_old_cuda(torch::Tensor vec, torch::Tensor mat,
                                      torch::Tensor mul, torch::Tensor scales,
                                      torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the legacy ("old"-format) batched 4-bit CUDA kernel.
void vecquant4matmul_batched_old(torch::Tensor vec, torch::Tensor mat,
                                 torch::Tensor mul, torch::Tensor scales,
                                 torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant4matmul_batched_old_cuda(vec, mat, mul, scales, zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant8matmul_batched_column_compression_old_cuda(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the legacy column-compressed batched 8-bit kernel.
void vecquant8matmul_batched_column_compression_old(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_batched_column_compression_old_cuda(vec, mat, mul, scales,
                                                      zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant4matmul_batched_column_compression_old_cuda(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the legacy column-compressed batched 4-bit kernel.
void vecquant4matmul_batched_column_compression_old(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant4matmul_batched_column_compression_old_cuda(vec, mat, mul, scales,
                                                      zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant8matmul_batched_faster_cuda(torch::Tensor vec, torch::Tensor mat,
                                         torch::Tensor mul,
                                         torch::Tensor scales,
                                         torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the "faster" batched 8-bit CUDA kernel variant.
void vecquant8matmul_batched_faster(torch::Tensor vec, torch::Tensor mat,
                                    torch::Tensor mul, torch::Tensor scales,
                                    torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_batched_faster_cuda(vec, mat, mul, scales, zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant8matmul_batched_faster_old_cuda(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the "faster" legacy-format batched 8-bit kernel.
void vecquant8matmul_batched_faster_old(torch::Tensor vec, torch::Tensor mat,
                                        torch::Tensor mul,
                                        torch::Tensor scales,
                                        torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_batched_faster_old_cuda(vec, mat, mul, scales, zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant8matmul_batched_column_compression_faster_cuda(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device
// before forwarding to the "faster" column-compressed batched 8-bit kernel.
void vecquant8matmul_batched_column_compression_faster(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_batched_column_compression_faster_cuda(vec, mat, mul, scales,
                                                         zeros);
}
// Kernel entry point; implementation lives in the companion .cu file.
void vecquant8matmul_batched_column_compression_faster_old_cuda(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros);

// Python-facing wrapper: pins the current CUDA device to `vec`'s device before
// forwarding to the "faster" legacy column-compressed batched 8-bit kernel.
void vecquant8matmul_batched_column_compression_faster_old(
    torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
    torch::Tensor scales, torch::Tensor zeros) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_batched_column_compression_faster_old_cuda(vec, mat, mul,
                                                             scales, zeros);
}
// Python bindings for the quantized-matmul CUDA kernels above.
// NOTE(review): the original file was truncated mid-string on the last m.def
// and followed by non-code scrape residue; the final binding's description is
// completed to match the pattern of its siblings, the module brace is closed,
// and the two wrappers defined above but left unregistered
// (vecquant4matmul_batched, vecquant4matmul_batched_column_compression) are
// now exposed as well — matching the upstream AutoGPTQ layout this file was
// adapted from. All pre-existing binding names and descriptions are unchanged,
// so existing Python callers are unaffected.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant8matmul_batched", &vecquant8matmul_batched, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant8matmul_batched_old", &vecquant8matmul_batched_old, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant8matmul_batched_faster", &vecquant8matmul_batched_faster, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant8matmul_batched_faster_old", &vecquant8matmul_batched_faster_old, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant4matmul_batched", &vecquant4matmul_batched, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant4matmul_batched_old", &vecquant4matmul_batched_old, "Vector 4-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant8matmul_batched_column_compression", &vecquant8matmul_batched_column_compression, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
  m.def("vecquant8matmul_batched_column_compression_old", &vecquant8matmul_batched_column_compression_old, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
  m.def("vecquant8matmul_batched_column_compression_faster", &vecquant8matmul_batched_column_compression_faster, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
  m.def("vecquant8matmul_batched_column_compression_faster_old", &vecquant8matmul_batched_column_compression_faster_old, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
  m.def("vecquant4matmul_batched_column_compression", &vecquant4matmul_batched_column_compression, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
  m.def("vecquant4matmul_batched_column_compression_old", &vecquant4matmul_batched_column_compression_old, "Vector old 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
}