# constants.py
  1. from torch._C._distributed_c10d import _DEFAULT_PG_TIMEOUT
  2. from datetime import timedelta
  3. from typing import Optional
  4. __all__ = ['default_pg_timeout', 'default_pg_nccl_timeout']
  5. # Default process group wide timeout, if applicable.
  6. # This only applies to the non-nccl backends
  7. # To make an attempt at backwards compatibility with THD, we use an
  8. # extraordinarily high default timeout, given that THD did not have timeouts.
  9. default_pg_timeout: timedelta = _DEFAULT_PG_TIMEOUT
  10. # Separate timeout for PGNCCL mainly becuase it's always been that way in the C++ layer, but until recently
  11. # there was one default that applied across all backends in the python layer.
  12. # Later, we could consider merging them back together at the c++ layer if we can align on a same value.
  13. # (only if TORCH_NCCL_BLOCKING_WAIT or TORCH_NCCL_ASYNC_ERROR_HANDLING is set to 1).
  14. try:
  15. from torch._C._distributed_c10d import _DEFAULT_PG_NCCL_TIMEOUT
  16. default_pg_nccl_timeout: Optional[timedelta] = _DEFAULT_PG_NCCL_TIMEOUT
  17. except ImportError:
  18. # if C++ NCCL support is not compiled, we don't have access to the default nccl value.
  19. # if anyone is actually trying to use nccl in this state, it should error.
  20. default_pg_nccl_timeout = None