diff --git a/cuda_bindings/cuda/bindings/_nvml.pxd b/cuda_bindings/cuda/bindings/_nvml.pxd index ddf9ab2b28..a0e6ed9ad9 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pxd +++ b/cuda_bindings/cuda/bindings/_nvml.pxd @@ -173,7 +173,7 @@ cpdef str device_get_inforom_version(intptr_t device, int object) cpdef str device_get_inforom_image_version(intptr_t device) cpdef unsigned int device_get_inforom_configuration_checksum(intptr_t device) except? 0 cpdef device_validate_inforom(intptr_t device) -cpdef unsigned long device_get_last_bbx_flush_time(intptr_t device, intptr_t timestamp) except? 0 +cpdef tuple device_get_last_bbx_flush_time(intptr_t device) cpdef int device_get_display_mode(intptr_t device) except? -1 cpdef int device_get_display_active(intptr_t device) except? -1 cpdef int device_get_persistence_mode(intptr_t device) except? -1 diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index b9c59d6637..dbb87e8d0b 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -22228,23 +22228,26 @@ cpdef device_validate_inforom(intptr_t device): check_status(__status__) -cpdef unsigned long device_get_last_bbx_flush_time(intptr_t device, intptr_t timestamp) except? 0: +cpdef tuple device_get_last_bbx_flush_time(intptr_t device): """Retrieves the timestamp and the duration of the last flush of the BBX (blackbox) infoROM object during the current run. Args: device (intptr_t): The identifier of the target device. - timestamp (intptr_t): The start timestamp of the last BBX Flush. Returns: - unsigned long: The duration (us) of the last BBX Flush. + A 2-tuple containing: + + - unsigned long long: The start timestamp of the last BBX Flush. + - unsigned long: The duration (us) of the last BBX Flush. .. 
seealso:: `nvmlDeviceGetLastBBXFlushTime` """ + cdef unsigned long long timestamp cdef unsigned long duration_us with nogil: - __status__ = nvmlDeviceGetLastBBXFlushTime(device, timestamp, &duration_us) + __status__ = nvmlDeviceGetLastBBXFlushTime(device, &timestamp, &duration_us) check_status(__status__) - return duration_us + return (timestamp, duration_us) cpdef int device_get_display_mode(intptr_t device) except? -1: @@ -27101,8 +27104,8 @@ cpdef object system_get_topology_gpu_set(unsigned int cpuNumber): __status__ = nvmlSystemGetTopologyGpuSet(cpuNumber, count, NULL) check_status_size(__status__) if count[0] == 0: - return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] + cdef view.array deviceArray = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlSystemGetTopologyGpuSet(cpuNumber, count, deviceArray.data) check_status(__status__) @@ -27141,8 +27144,8 @@ cpdef object unit_get_devices(intptr_t unit): __status__ = nvmlUnitGetDevices(unit, deviceCount, NULL) check_status_size(__status__) if deviceCount[0] == 0: - return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] + cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlUnitGetDevices(unit, deviceCount, deviceArray.data) check_status(__status__) @@ -27169,8 +27172,8 @@ cpdef object device_get_topology_nearest_gpus(intptr_t device, unsigned int leve ) check_status_size(__status__) if count[0] == 0: - return 
view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] + cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlDeviceGetTopologyNearestGpus( device, @@ -27834,9 +27837,9 @@ cpdef object device_get_gpu_instances(intptr_t device, unsigned int profile_id): check_status_size(__status__) if count[0] == 0: - view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] + view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] - cdef view.array gpuInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + cdef view.array gpuInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlDeviceGetGpuInstances(device, profile_id, gpuInstances.data, count) check_status(__status__) @@ -27860,9 +27863,9 @@ cpdef object gpu_instance_get_compute_instances(intptr_t gpu_instance, unsigned check_status_size(__status__) if count[0] == 0: - view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] + view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] - cdef view.array computeInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + cdef view.array computeInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlGpuInstanceGetComputeInstances(gpu_instance, profile_id, computeInstances.data, count) check_status(__status__) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index c9a2e8f369..b013ef79ca 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ 
b/cuda_core/cuda/core/system/_device.pyx @@ -13,11 +13,18 @@ from cuda.bindings import _nvml as nvml from ._nvml_context cimport initialize include "_device_utils.pxi" +include "_inforom.pxi" +AddressingMode = nvml.DeviceAddressingModeType BrandType = nvml.BrandType EventType = nvml.EventType FieldId = nvml.FieldId +GpuP2PCapsIndex = nvml.GpuP2PCapsIndex +GpuP2PStatus = nvml.GpuP2PStatus +GpuTopologyLevel = nvml.GpuTopologyLevel +InforomObject = nvml.InforomObject +PcieUtilCounter = nvml.PcieUtilCounter class DeviceArchitecture: @@ -128,52 +135,146 @@ cdef class PciInfo: """ PCI information about a GPU device. """ - cdef object _pci_info - def __init__(self, pci_info: nvml.PciInfo): - self._pci_info = pci_info + cdef object _pci_info_ext + cdef intptr_t _handle + + def __init__(self, pci_info_ext: nvml.PciInfoExt_v1, handle: int): + self._pci_info_ext = pci_info_ext + self._handle = handle @property def bus(self) -> int: """ The bus on which the device resides, 0 to 255 """ - return self._pci_info.bus + return self._pci_info_ext.bus @property def bus_id(self) -> str: """ The tuple domain:bus:device.function PCI identifier string """ - return self._pci_info.bus_id + return self._pci_info_ext.bus_id @property def device(self) -> int: """ The device's id on the bus, 0 to 31 """ - return self._pci_info.device_ + return self._pci_info_ext.device_ @property def domain(self) -> int: """ The PCI domain on which the device's bus resides, 0 to 0xffffffff """ - return self._pci_info.domain + return self._pci_info_ext.domain @property def vendor_id(self) -> int: """ The PCI vendor id of the device """ - return self._pci_info.pci_device_id & 0xFFFF + return self._pci_info_ext.pci_device_id & 0xFFFF @property def device_id(self) -> int: """ The PCI device id of the device """ - return self._pci_info.pci_device_id >> 16 + return self._pci_info_ext.pci_device_id >> 16 + + @property + def subsystem_id(self) -> int: + """ + The subsystem device ID + """ + return 
self._pci_info_ext.pci_sub_system_id + + @property + def base_class(self) -> int: + """ + The 8-bit PCI base class code + """ + return self._pci_info_ext.base_class + + @property + def sub_class(self) -> int: + """ + The 8-bit PCI sub class code + """ + return self._pci_info_ext.sub_class + + def get_max_pcie_link_generation(self) -> int: + """ + Retrieve the maximum PCIe link generation possible with this device and system. + + For Fermi™ or newer fully supported devices. + + For example, for a generation 2 PCIe device attached to a generation 1 + PCIe bus, the max link generation this function will report is + generation 1. + """ + return nvml.device_get_max_pcie_link_generation(self._handle) + + def get_gpu_max_pcie_link_generation(self) -> int: + """ + Retrieve the maximum PCIe link generation supported by this GPU device. + + For Fermi™ or newer fully supported devices. + """ + return nvml.device_get_gpu_max_pcie_link_generation(self._handle) + + def get_max_pcie_link_width(self) -> int: + """ + Retrieve the maximum PCIe link width possible with this device and system. + + For Fermi™ or newer fully supported devices. + + For example, for a device with a 16x PCIe bus width attached to a 8x + PCIe system bus this function will report + a max link width of 8. + """ + return nvml.device_get_max_pcie_link_width(self._handle) + + def get_current_pcie_link_generation(self) -> int: + """ + Retrieve the current PCIe link generation. + + For Fermi™ or newer fully supported devices. + """ + return nvml.device_get_curr_pcie_link_generation(self._handle) + + def get_current_pcie_link_width(self) -> int: + """ + Retrieve the current PCIe link width. + + For Fermi™ or newer fully supported devices. + """ + return nvml.device_get_curr_pcie_link_width(self._handle) + + def get_pcie_throughput(self, counter: PcieUtilCounter) -> int: + """ + Retrieve PCIe utilization information, in KB/s. 
+ + This function is querying a byte counter over a 20ms interval, and thus + is the PCIe throughput over that interval. + + For Maxwell™ or newer fully supported devices. + + This method is not supported in virtual machines running virtual GPU + (vGPU). + """ + return nvml.device_get_pcie_throughput(self._handle, counter) + + def get_pcie_replay_counter(self) -> int: + """ + Retrieve the PCIe replay counter. + + For Kepler™ or newer fully supported devices. + """ + return nvml.device_get_pcie_replay_counter(self._handle) cdef class EventData: @@ -516,6 +617,30 @@ cdef class FieldValues: return [x.value for x in self] +cdef class RepairStatus: + """ + Repair status for TPC/Channel repair. + """ + cdef object _repair_status + + def __init__(self, handle: int): + self._repair_status = nvml.device_get_repair_status(handle) + + @property + def channel_repair_pending(self) -> bool: + """ + `True` if a channel repair is pending. + """ + return bool(self._repair_status.b_channel_repair_pending) + + @property + def tpc_repair_pending(self) -> bool: + """ + `True` if a TPC repair is pending. + """ + return bool(self._repair_status.b_tpc_repair_pending) + + cdef class Device: """ Representation of a device. @@ -579,6 +704,18 @@ cdef class Device: pci_bus_id = pci_bus_id.decode("ascii") self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id) + @classmethod + def get_device_count(cls) -> int: + """ + Get the number of available devices. + + Returns + ------- + int + The number of available devices. + """ + return nvml.device_get_count_v2() + @classmethod def get_all_devices(cls) -> Iterable[Device]: """ @@ -589,10 +726,32 @@ cdef class Device: Iterator of Device An iterator over available devices. 
""" - total = nvml.device_get_count_v2() - for device_id in range(total): + for device_id in range(nvml.device_get_count_v2()): yield cls(index=device_id) + @classmethod + def get_all_devices_with_cpu_affinity(cls, cpu_index: int) -> Iterable[Device]: + """ + Retrieve the set of GPUs that have a CPU affinity with the given CPU number. + + Supported on Linux only. + + Parameters + ---------- + cpu_index: int + The CPU index. + + Returns + ------- + Iterator of Device + An iterator over available devices. + """ + cdef Device device + for handle in nvml.system_get_topology_gpu_set(cpu_index): + device = Device.__new__(Device) + device._handle = handle + yield device + @property def architecture(self) -> DeviceArchitecture: """ @@ -678,7 +837,7 @@ cdef class Device: """ The PCI attributes of this device. """ - return PciInfo(nvml.device_get_pci_info_v3(self._handle)) + return PciInfo(nvml.device_get_pci_info_ext(self._handle), self._handle) @property def serial(self) -> str: @@ -755,6 +914,130 @@ cdef class Device: bitmask[0] = nvml.device_get_supported_event_types(self._handle) return [EventType(1 << ev) for ev in _unpack_bitmask(bitmask)] + @property + def index(self) -> int: + """ + The NVML index of this device. + + Valid indices are derived from the count returned by + :meth:`Device.get_device_count`. For example, if ``get_device_count()`` + returns 2, the valid indices are 0 and 1, corresponding to GPU 0 and GPU + 1. + + The order in which NVML enumerates devices has no guarantees of + consistency between reboots. For that reason, it is recommended that + devices be looked up by their PCI ids or GPU UUID. + + Note: The NVML index may not correlate with other APIs, such as the CUDA + device index. + """ + return nvml.device_get_index(self._handle) + + @property + def module_id(self) -> int: + """ + Get a unique identifier for the device module on the baseboard. + + This API retrieves a unique identifier for each GPU module that exists + on a given baseboard. 
For non-baseboard products, this ID would always + be 0. + """ + return nvml.device_get_module_id(self._handle) + + @property + def minor_number(self) -> int: + """ + The minor number of this device. + + For Linux only. + + The minor number is used by the Linux device driver to identify the + device node in ``/dev/nvidiaX``. + """ + return nvml.device_get_minor_number(self._handle) + + @property + def addressing_mode(self) -> AddressingMode: + """ + Get the addressing mode of the device. + + Addressing modes can be one of: + + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_HMM`: System allocated + memory (``malloc``, ``mmap``) is addressable from the device (GPU), via + software-based mirroring of the CPU's page tables, on the GPU. + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_ATS`: System allocated + memory (``malloc``, ``mmap``) is addressable from the device (GPU), via + Address Translation Services. This means that there is (effectively) a + single set of page tables, and the CPU and GPU both use them. + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_NONE`: Neither HMM nor ATS + is active. + """ + return AddressingMode(nvml.device_get_addressing_mode(self._handle).value) + + @property + def display_mode(self) -> bool: + """ + The display mode for this device. + + Indicates whether a physical display (e.g. monitor) is currently connected to + any of the device's connectors. + """ + return True if nvml.device_get_display_mode(self._handle) == nvml.EnableState.FEATURE_ENABLED else False + + @property + def display_active(self) -> bool: + """ + The display active status for this device. + + Indicates whether a display is initialized on the device. For example, + whether X Server is attached to this device and has allocated memory for + the screen. + + Display can be active even when no monitor is physically attached. 
+ """ + return True if nvml.device_get_display_active(self._handle) == nvml.EnableState.FEATURE_ENABLED else False + + @property + def repair_status(self) -> RepairStatus: + """ + Get the repair status for TPC/Channel repair. + + For Ampere™ or newer fully supported devices. + """ + return RepairStatus(self._handle) + + @property + def inforom(self) -> InforomInfo: + """ + Accessor for InfoROM information. + + For all products with an InfoROM. + """ + return InforomInfo(self) + + def get_topology_nearest_gpus(self, level: GpuTopologyLevel) -> Iterable[Device]: + """ + Retrieve the GPUs that are nearest to this device at a specific interconnectivity level. + + Supported on Linux only. + + Parameters + ---------- + level: :class:`GpuTopologyLevel` + The topology level. + + Returns + ------- + Iterable of :class:`Device` + The nearest devices at the given topology level. + """ + cdef Device device + for handle in nvml.device_get_topology_nearest_gpus(self._handle, level): + device = Device.__new__(Device) + device._handle = handle + yield device + @property def attributes(self) -> DeviceAttributes: """ @@ -829,7 +1112,61 @@ cdef class Device: nvml.device_clear_field_values(self._handle, field_ids) +def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel: + """ + Retrieve the common ancestor for two devices. + + For Linux only. + + Parameters + ---------- + device1: :class:`Device` + The first device. + device2: :class:`Device` + The second device. + + Returns + ------- + :class:`GpuTopologyLevel` + The common ancestor level of the two devices. + """ + return GpuTopologyLevel( + nvml.device_get_topology_common_ancestor( + device1._handle, + device2._handle, + ) + ) + + +def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex) -> GpuP2PStatus: + """ + Retrieve the P2P status between two devices. + + Parameters + ---------- + device1: :class:`Device` + The first device. + device2: :class:`Device` + The second device. 
+ index: :class:`GpuP2PCapsIndex` + The P2P capability index being looked for between ``device1`` and ``device2``. + + Returns + ------- + :class:`GpuP2PStatus` + The P2P status between the two devices. + """ + return GpuP2PStatus( + nvml.device_get_p2p_status( + device1._handle, + device2._handle, + index, + ) + ) + + __all__ = [ + "AddressingMode", "BAR1MemoryInfo", "BrandType", "Device", @@ -841,6 +1178,15 @@ __all__ = [ "FieldId", "FieldValue", "FieldValues", + "GpuP2PCapsIndex", + "GpuP2PStatus", + "GpuTopologyLevel", + "InforomInfo", + "InforomObject", "MemoryInfo", + "PcieUtilCounter", "PciInfo", + "RepairStatus", + "get_p2p_status", + "get_topology_common_ancestor", ] diff --git a/cuda_core/cuda/core/system/_inforom.pxi b/cuda_core/cuda/core/system/_inforom.pxi new file mode 100644 index 0000000000..c82347ee18 --- /dev/null +++ b/cuda_core/cuda/core/system/_inforom.pxi @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class InforomInfo: + cdef Device _device + + def __init__(self, device: Device): + self._device = device + + def get_version(self, inforom: InforomObject) -> str: + """ + Retrieves the InfoROM version for a given InfoROM object. + + For all products with an InfoROM. + + Fermi™ and higher parts have non-volatile on-board memory for persisting + device info, such as aggregate ECC counts. + + Parameters + ---------- + inforom: :class:`InforomObject` + The InfoROM object to query. + + Returns + ------- + str + The InfoROM version. + """ + return nvml.device_get_inforom_version(self._device._handle, inforom) + + @property + def image_version(self) -> str: + """ + Retrieves the global InfoROM image version. + + For all products with an InfoROM. 
+ + Image version just like VBIOS version uniquely describes the exact + version of the InfoROM flashed on the board in contrast to InfoROM + object version which is only an indicator of supported features. + + Returns + ------- + str + The InfoROM image version. + """ + return nvml.device_get_inforom_image_version(self._device._handle) + + @property + def configuration_checksum(self) -> int: + """ + Retrieves the checksum of the configuration stored in the device's InfoROM. + + For all products with an InfoROM. + + Can be used to make sure that two GPUs have the exact same + configuration. Current checksum takes into account configuration stored + in PWR and ECC InfoROM objects. Checksum can change between driver + releases or when user changes configuration (e.g. disable/enable ECC) + + Returns + ------- + int + The InfoROM checksum. + """ + return nvml.device_get_inforom_configuration_checksum(self._device._handle) + + def validate(self) -> None: + """ + Reads the InfoROM from the flash and verifies the checksums. + + For all products with an InfoROM. + + Raises + ------ + :class:`cuda.core.system.CorruptedInforomError` + If the device's InfoROM is corrupted. + """ + nvml.device_validate_inforom(self._device._handle) + + @property + def bbx_flush_time(self) -> tuple[int, int]: + """ + Retrieves the timestamp and duration of the last flush of the BBX + (blackbox) InfoROM object during the current run. + + For all products with an InfoROM. + + Returns + ------- + tuple[int, int] + - timestamp: The start timestamp of the last BBX flush + - duration_us: The duration (in μs) of the last BBX flush + """ + return nvml.device_get_last_bbx_flush_time(self._device._handle) + + @property + def board_part_number(self) -> str: + """ + The device board part number which is programmed into the board's InfoROM. 
+ """ + return nvml.device_get_board_part_number(self._device._handle) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index de5cedda19..1c10bb7298 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -79,6 +79,8 @@ CUDA system information and NVIDIA Management Library (NVML) system.get_num_devices system.get_nvml_version system.get_process_name + system.get_topology_common_ancestor + system.get_p2p_status system.register_events system.RegisteredSystemEvents @@ -89,6 +91,7 @@ CUDA system information and NVIDIA Management Library (NVML) :template: autosummary/cyclass.rst system.Device + system.AddressingMode system.BAR1MemoryInfo system.BrandType system.DeviceArchitecture @@ -99,8 +102,15 @@ CUDA system information and NVIDIA Management Library (NVML) system.FieldId system.FieldValue system.FieldValues + system.GpuP2PCapsIndex + system.GpuP2PStatus + system.GpuTopologyLevel + system.InforomInfo + system.InforomObject system.MemoryInfo + system.PcieUtilCounter system.PciInfo + system.RepairStatus .. 
module:: cuda.core.utils diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index a92684e66b..8f07b2ee27 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -9,6 +9,7 @@ pytestmark = skip_if_nvml_unsupported import array +import multiprocessing import os import re import sys @@ -28,6 +29,10 @@ def check_gpu_available(): pytest.skip("No GPUs available to run device tests", allow_module_level=True) +def test_device_count(): + assert system.Device.get_device_count() == system.get_num_devices() + + def test_device_architecture(): for device in system.Device.get_all_devices(): device_arch = device.architecture @@ -138,6 +143,34 @@ def test_device_pci_info(): assert isinstance(pci_info.device_id, int) assert 0x0000 <= pci_info.device_id <= 0xFFFF + assert isinstance(pci_info.subsystem_id, int) + assert 0x00000000 <= pci_info.subsystem_id <= 0xFFFFFFFF + + assert isinstance(pci_info.base_class, int) + assert 0x00 <= pci_info.base_class <= 0xFF + + assert isinstance(pci_info.sub_class, int) + assert 0x00 <= pci_info.sub_class <= 0xFF + + assert isinstance(pci_info.get_max_pcie_link_generation(), int) + assert 0 <= pci_info.get_max_pcie_link_generation() <= 0xFF + + assert isinstance(pci_info.get_gpu_max_pcie_link_generation(), int) + assert 0 <= pci_info.get_gpu_max_pcie_link_generation() <= 0xFF + + assert isinstance(pci_info.get_max_pcie_link_width(), int) + assert 0 <= pci_info.get_max_pcie_link_width() <= 0xFF + + assert isinstance(pci_info.get_current_pcie_link_generation(), int) + assert 0 <= pci_info.get_current_pcie_link_generation() <= 0xFF + + assert isinstance(pci_info.get_current_pcie_link_width(), int) + assert 0 <= pci_info.get_current_pcie_link_width() <= 0xFF + + assert isinstance(pci_info.get_pcie_throughput(system.PcieUtilCounter.PCIE_UTIL_TX_BYTES), int) + + assert isinstance(pci_info.get_pcie_replay_counter(), int) + def 
test_device_serial(): skip_reasons = set() @@ -350,3 +383,142 @@ def test_field_values(): if skip_reasons: pytest.skip(" ; ".join(skip_reasons)) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_all_devices_with_cpu_affinity(): + try: + for i in range(multiprocessing.cpu_count()): + for device in system.Device.get_all_devices_with_cpu_affinity(i): + affinity = device.cpu_affinity + assert isinstance(affinity, list) + assert i in affinity + except system.NotSupportedError: + pytest.skip("Getting devices with CPU affinity not supported") + + +def test_index(): + for i, device in enumerate(system.Device.get_all_devices()): + index = device.index + assert isinstance(index, int) + assert index == i + + +def test_module_id(): + for device in system.Device.get_all_devices(): + module_id = device.module_id + assert isinstance(module_id, int) + assert module_id >= 0 + + +def test_addressing_mode(): + for device in system.Device.get_all_devices(): + try: + addressing_mode = device.addressing_mode + except system.NotSupportedError: + pytest.skip(f"Device addressing mode not supported by device '{device.name}'") + continue + assert isinstance(addressing_mode, system.AddressingMode) + + +def test_display_mode(): + for device in system.Device.get_all_devices(): + display_mode = device.display_mode + assert isinstance(display_mode, bool) + + display_active = device.display_active + assert isinstance(display_active, bool) + + +def test_repair_status(): + for device in system.Device.get_all_devices(): + repair_status = device.repair_status + assert isinstance(repair_status, system.RepairStatus) + + assert isinstance(repair_status.channel_repair_pending, bool) + assert isinstance(repair_status.tpc_repair_pending, bool) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_topology_common_ancestor(): + # TODO: This 
is not a great test, and probably doesn't test much of anything + # in practice on our CI. + + if system.Device.get_device_count() < 2: + pytest.skip("Test requires at least 2 GPUs") + return + + devices = list(system.Device.get_all_devices()) + + ancestor = system.get_topology_common_ancestor(devices[0], devices[1]) + assert isinstance(ancestor, system.GpuTopologyLevel) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_p2p_status(): + # TODO: This is not a great test, and probably doesn't test much of anything + # in practice on our CI. + + if system.Device.get_device_count() < 2: + pytest.skip("Test requires at least 2 GPUs") + return + + devices = list(system.Device.get_all_devices()) + + status = system.get_p2p_status(devices[0], devices[1], system.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ) + assert isinstance(status, system.GpuP2PStatus) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_nearest_gpus(): + # TODO: This is not a great test, and probably doesn't test much of anything + # in practice on our CI. 
+ + for device in system.Device.get_all_devices(): + for near_device in device.get_topology_nearest_gpus(system.GpuTopologyLevel.TOPOLOGY_SINGLE): + assert isinstance(near_device, system.Device) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_minor_number(): + for device in system.Device.get_all_devices(): + minor_number = device.minor_number + assert isinstance(minor_number, int) + assert minor_number >= 0 + + +def test_get_inforom_version(): + for device in system.Device.get_all_devices(): + inforom = device.inforom + + inforom_image_version = inforom.image_version + assert isinstance(inforom_image_version, str) + assert len(inforom_image_version) > 0 + + inforom_version = inforom.get_version(system.InforomObject.INFOROM_OEM) + assert isinstance(inforom_version, str) + assert len(inforom_version) > 0 + + checksum = inforom.configuration_checksum + assert isinstance(checksum, int) + + # TODO: This is untested locally. + try: + timestamp, duration_us = inforom.bbx_flush_time + except (system.NotSupportedError, system.NotReadyError): + pass + else: + assert isinstance(timestamp, int) + assert timestamp > 0 + assert isinstance(duration_us, int) + assert duration_us > 0 + + try: + board_part_number = inforom.board_part_number + except system.NotSupportedError: + pass + else: + assert isinstance(board_part_number, str) + assert len(board_part_number) > 0 + + inforom.validate()