#include <c10d/ProcessGroupGloo.hpp>
#include <netdb.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include <type_traits>
#include <gloo/allgather.h>
#include <gloo/allgatherv.h>
#include <gloo/allreduce.h>
#include <gloo/barrier.h>
#include <gloo/broadcast.h>
#include <gloo/gather.h>
#include <gloo/reduce.h>
#include <gloo/scatter.h>
#include <ATen/SparseTensorUtils.h>
#ifdef USE_CUDA
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/cuda/PinnedMemoryAllocator.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif
#include <gloo/config.h>
#include <gloo/rendezvous/context.h>
#include <gloo/rendezvous/prefix_store.h>
#if GLOO_HAVE_TRANSPORT_TCP
#include <gloo/transport/tcp/device.h>
#endif
#if GLOO_HAVE_TRANSPORT_UV
#include <gloo/transport/uv/device.h>
#endif
// On Linux, check that the tcp transport is available.
#ifdef __linux__
#if !GLOO_HAVE_TRANSPORT_TCP
#error "Expected the tcp transport to be available on Linux."
#endif
#endif
// On macOS, check that the uv transport is available.
#ifdef __APPLE__
#if !GLOO_HAVE_TRANSPORT_UV
#error "Expected the uv transport to be available on macOS."
#endif
#endif
// Runtime-to-compile-time type dispatch: expands to a switch over an
// at::ScalarType that calls func<T>(args...) with the matching C++ type.
// Half maps to gloo::float16; any scalar type not listed below throws
// std::runtime_error("Invalid scalar type").
// (Comment must stay above the #define: a // comment on a continuation
// line would swallow the trailing backslash.)
#define GENERATE_ALL_TYPES(type, func, args...) \
switch (type) { \
case ::at::ScalarType::Float: \
func<float>(args); \
break; \
case ::at::ScalarType::Double: \
func<double>(args); \
break; \
case ::at::ScalarType::Half: \
func<gloo::float16>(args); \
break; \
case ::at::ScalarType::Char: \
func<int8_t>(args); \
break; \
case ::at::ScalarType::Byte: \
func<uint8_t>(args); \
break; \
case ::at::ScalarType::Int: \
func<int32_t>(args); \
break; \
case ::at::ScalarType::Long: \
func<int64_t>(args); \
break; \
default: \
throw std::runtime_error("Invalid scalar type"); \
}
namespace c10d {
namespace {
// Wrap c10d store as Gloo store
// Adapter exposing a c10d::Store through the Gloo rendezvous Store
// interface, so a Gloo context can rendezvous via any c10d store.
class GlooStore : public ::gloo::rendezvous::Store {
 public:
  GlooStore(const std::shared_ptr<::c10d::Store>& store) : store_(store) {}

  // Gloo passes char buffers; c10d expects uint8_t — convert and forward.
  void set(const std::string& key, const std::vector<char>& value) override {
    store_->set(key, std::vector<uint8_t>(value.begin(), value.end()));
  }

  // Fetch from the underlying store and convert back to char buffer.
  std::vector<char> get(const std::string& key) override {
    const auto raw = store_->get(key);
    std::vector<char> result(raw.begin(), raw.end());
    return result;
  }

  // Wait with the store's default timeout.
  void wait(const std::vector<std::string>& keys) override {
    store_->wait(keys, Store::kDefaultTimeout);
  }

  // Wait with an explicit timeout.
  void wait(
      const std::vector<std::string>& keys,
      const std::chrono::milliseconds& timeout) override {
    store_->wait(keys, timeout);
  }

 protected:
  std::shared_ptr<::c10d::Store> store_;
};
// Signature shared by all Gloo element-wise reduction kernels:
// (output, input_a, input_b, element_count).
typedef void (*ReduceFunc)(void*, const void*, const void*, size_t);

// Map a ReduceOp to the Gloo reduction kernel for non-integral element
// types. Bitwise ops are rejected: they are meaningless on floats.
template <
    typename T,
    typename std::enable_if<!std::is_integral<T>::value, int>::type = 0>
ReduceFunc toFunction(const ReduceOp& r) {
  if (r == ReduceOp::SUM) {
    return ReduceFunc(&::gloo::sum<T>);
  }
  if (r == ReduceOp::PRODUCT) {
    return ReduceFunc(&::gloo::product<T>);
  }
  if (r == ReduceOp::MIN) {
    return ReduceFunc(&::gloo::min<T>);
  }
  if (r == ReduceOp::MAX) {
    return ReduceFunc(&::gloo::max<T>);
  }
  if (r == ReduceOp::BAND) {
    throw std::runtime_error(
        "Cannot use ReduceOp.BAND with non-integral dtype");
  }
  if (r == ReduceOp::BOR) {
    throw std::runtime_error(
        "Cannot use ReduceOp.BOR with non-integral dtype");
  }
  if (r == ReduceOp::BXOR) {
    throw std::runtime_error(
        "Cannot use ReduceOp.BXOR with non-integral dtype");
  }
  // ReduceOp::UNUSED (or any unexpected value) falls through to here.
  throw std::runtime_error("Unhandled ReduceOp");
}
// Element-wise bitwise AND: c[i] = a[i] & b[i]. SFINAE restricts the
// template to integral element types.
template <
    typename T,
    typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
void band(void* c, const void* a, const void* b, size_t n) {
  const T* lhs = static_cast<const T*>(a);
  const T* rhs = static_cast<const T*>(b);
  T* out = static_cast<T*>(c);
  for (size_t i = 0; i < n; ++i) {
    out[i] = lhs[i] & rhs[i];
  }
}
// Element-wise bitwise OR: c[i] = a[i] | b[i]. SFINAE restricts the
// template to integral element types.
template <
    typename T,
    typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
void bor(void* c, const void* a, const void* b, size_t n) {
  const T* lhs = static_cast<const T*>(a);
  const T* rhs = static_cast<const T*>(b);
  T* out = static_cast<T*>(c);
  for (size_t i = 0; i < n; ++i) {
    out[i] = lhs[i] | rhs[i];
  }
}
// Element-wise bitwise XOR: c[i] = a[i] ^ b[i]. SFINAE restricts the
// template to integral element types.
template <
    typename T,
    typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
void bxor(void* c, const void* a, const void* b, size_t n) {
  const T* lhs = static_cast<const T*>(a);
  const T* rhs = static_cast<const T*>(b);
  T* out = static_cast<T*>(c);
  for (size_t i = 0; i < n; ++i) {
    out[i] = lhs[i] ^ rhs[i];
  }
}
// Map a ReduceOp to a reduction kernel for integral element types.
// Arithmetic ops use Gloo's built-ins; bitwise ops use the local
// band/bor/bxor helpers defined above.
template <
    typename T,
    typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
ReduceFunc toFunction(const ReduceOp& r) {
  if (r == ReduceOp::SUM) {
    return ReduceFunc(&::gloo::sum<T>);
  }
  if (r == ReduceOp::PRODUCT) {
    return ReduceFunc(&::gloo::product<T>);
  }
  if (r == ReduceOp::MIN) {
    return ReduceFunc(&::gloo::min<T>);
  }
  if (r == ReduceOp::MAX) {
    return ReduceFunc(&::gloo::max<T>);
  }
  if (r == ReduceOp::BAND) {
    return ReduceFunc(&band<T>);
  }
  if (r == ReduceOp::BOR) {
    return ReduceFunc(&bor<T>);
  }
  if (r == ReduceOp::BXOR) {
    return ReduceFunc(&bxor<T>);
  }
  // ReduceOp::UNUSED (or any unexpected value) falls through to here.
  throw std::runtime_error("Unhandled ReduceOp");
}
// Helpers that bind at::Tensor storage to Gloo collective options structs
// (the opts parameter, e.g. AllreduceOptions). getDataPointer/getDataPointers
// are project helpers not visible in this chunk; presumably they return the
// tensor's data pointer(s) cast to T* — TODO confirm in Utils.
//
// Bind multiple tensors as collective inputs. Every tensor is registered
// with tensors[0]'s element count, so callers are expected to pass
// equally-sized tensors — NOTE(review): not checked here; verify callers.
template <typename T, typename O>
void setInputs(O& opts, std::vector<at::Tensor>& tensors) {
opts.setInputs(getDataPointers<T>(tensors), tensors[0].numel());
}
// Bind a single tensor as the collective input.
template <typename T, typename O>
void setInput(O& opts, at::Tensor& tensor) {
opts.setInput(getDataPointer<T>(tensor), tensor.numel());
}
// Bind multiple tensors as collective outputs (same sizing assumption
// as setInputs above).
template <typename T, typename O>
void setOutputs(O& opts, std::vector<at::Tensor>& tensors) {
opts.setOutputs(getDataPointers<T>(tensors), tensors[0].numel());
}
// Bind a single tensor as the collective output.
template <typename T, typename O>
void setOutput(O& opts, at::Tensor& tensor) {
opts.setOutput(getDataPointer<T>(tensor), tensor.numel());
}
// Bind a single output tensor with explicit per-rank element counts
// (used by uneven collectives such as allgatherv).
template <typename T, typename O>
void setOutput(O& opts, at::Tensor& tensor, std::vector<size_t>& counts) {
opts.setOutput(getDataPointer<T>(tensor), counts);
}
#ifdef USE_CUDA
// Build a CPU tensor with the same sizes/strides as `tensor`, backed by
// CUDA pinned (page-locked) host memory from the pinned-memory allocator.
at::Tensor pinnedLike(at::Tensor& tensor) {
auto* allocator = at::cuda::getPinnedMemoryAllocator();
// Storage is sized from sizes+strides, so non-contiguous layouts get
// enough room for their full extent.
auto storage = c10::Storage(
tensor.dtype(),
at::detail::computeStorageSize(tensor.sizes(), tensor.strides()),
allocator,
/*resizable=*/false);
// Create an empty CPU tensor, then re-point it at the pinned storage
// using the source tensor's geometry (offset 0).
return at::empty({0}, tensor.options().device(at::kCPU))
.set_(storage, 0, tensor.sizes(), tensor.strides());
}
// This function initializes a vector of CUDA streams, one for every
// tensor in the input tensor vector, and ensures that these streams are
// synchronized with the current default streams. (NOTE: comment was
// truncated in this copy; the function it documents is not visible here.)
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
libtorch.zip (1639个子文件)
ProcessGroupGloo.cpp 76KB
THTensorConv.cpp 69KB
ProcessGroupNCCL.cpp 28KB
THTensorMath.cpp 28KB
THTensor.cpp 25KB
ProcessGroupMPI.cpp 21KB
THTensorLapack.cpp 17KB
THTensorRandom.cpp 17KB
THBlas.cpp 14KB
ProcessGroupNCCLTest.cpp 14KB
TCPStore.cpp 13KB
ProcessGroupGlooTest.cpp 11KB
ProcessGroupMPITest.cpp 11KB
Utils.cpp 10KB
FileStore.cpp 9KB
THVectorDispatch.cpp 8KB
ProcessGroupGlooAsyncTest.cpp 8KB
ProcessGroupNCCLErrorsTest.cpp 5KB
THLapack.cpp 5KB
manager.cpp 4KB
THStorage.cpp 4KB
core.cpp 4KB
TCPStoreTest.cpp 3KB
FileStoreTest.cpp 2KB
HashStore.cpp 2KB
HashStoreTest.cpp 2KB
ProcessGroup.cpp 2KB
THStorageCopy.cpp 2KB
PrefixStore.cpp 1KB
NCCLUtils.cpp 1KB
allreduce.cpp 895B
core.cpp 760B
THTensorFill.cpp 701B
Store.cpp 386B
CUDATest.cu 578B
CUDAApplyUtils.cuh 46KB
NumericLimits.cuh 5KB
TensorInfo.cuh 3KB
OffsetCalculator.cuh 2KB
IndexUtils.cuh 780B
CUDATensorMethods.cuh 285B
torch.dll 37.6MB
c10.dll 276KB
Functions.h 985KB
descriptor.pb.h 497KB
caffe2.pb.h 312KB
TensorMethods.h 289KB
vulkan.h 196KB
NativeFunctions.h 180KB
Functions.h 176KB
RegistrationDeclarations.h 172KB
python_torch_functions_dispatch.h 147KB
variant.h 105KB
torch.pb.h 97KB
pybind11.h 93KB
repeated_field.h 93KB
variable_factories.h 91KB
cast.h 89KB
type.pb.h 89KB
descriptor.h 89KB
NeuralNetworks.h 83KB
extension_set.h 73KB
metanet.pb.h 73KB
segment_reduction_op.h 71KB
CUDAType.h 66KB
numpy.h 66KB
order_preserving_flat_hash_map.h 66KB
TensorImpl.h 65KB
python_variable_methods_dispatch.h 65KB
TypeDefault.h 63KB
flat_hash_map.h 62KB
cl.h 61KB
prof_dag.pb.h 60KB
predictor_consts.pb.h 60KB
operator.h 60KB
coded_stream.h 57KB
plugin.pb.h 57KB
pytypes.h 57KB
CPUType.h 56KB
message.h 54KB
wrappers.pb.h 52KB
jit_type.h 49KB
utility_ops.h 48KB
TensorBody.h 47KB
image_input_op.h 47KB
psimd.h 46KB
mpscnn_kernels.h 45KB
map.h 44KB
api.pb.h 44KB
message_differencer.h 42KB
wire_format_lite_inl.h 42KB
cpuinfo.h 42KB
cl_platform.h 41KB
hsm.pb.h 41KB
ir.h 41KB
wire_format_lite.h 41KB
map_type_handler.h 38KB
common.h 37KB
struct.pb.h 37KB
video_input_op.h 35KB
共 1639 条
- 1
- 2
- 3
- 4
- 5
- 6
- 17
丛继晔
- 粉丝: 67
- 资源: 2
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
- 基于Matlab人脸肤色定理的教师人数统计+源代码+全部数据+文档说明+详细注释+使用说明+截图(高分课程设计)
- 基于Matlab霍夫曼变换的表盘读数识别+源代码+全部数据+文档说明+详细注释+使用说明+截图(高分课程设计)
- 基于Matlab火灾烟雾检测源码带GUI界面+源代码+全部数据+文档说明+详细注释+使用说明+截图(高分课程设计)
- 基于Matlab的恶劣天气交通标志识别系统+源代码+全部数据+文档说明+详细注释+使用说明+截图(高分课程设计)
- 基于MATLAB的霍夫曼变换的表盘示数识别+源代码+全部数据+文档说明+详细注释+使用说明+截图(高分课程设计)
- 基于Matlab的车道线识别系统 +源代码+全部数据+文档说明+详细注释+使用说明+截图(高分课程设计)
- 基于MATLAB的教室人数统计系统带Gui界面+源代码+全部数据+文档说明+详细注释+使用说明+截图(高分课程设计)
- 基于MATLAB的教室人数统计系统带Gui界面+源代码+全部数据+文档说明+详细注释+使用说明+截图(高分课程设计)
- 基于MATLAB 的霍夫曼变换答题卡识别源码+全部数据+文档说明+详细注释+使用说明+截图(高分课程设计)
- 基于Matlab+bp神经网络的神经网络汉字识别系统+源代码+全部数据+文档说明+详细注释+使用说明+截图(高分课程设计)
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功
- 1
- 2
前往页