Post

Monitor GPUs in Home Assistant

Today I learned how to install NVApi to monitor my GPUs in Home Assistant.

NVApi is a lightweight API designed for monitoring NVIDIA GPU utilization and enabling automated power management. It provides real-time GPU metrics, supports integration with tools like Home Assistant, and offers flexible power management and PCIe link speed management based on workload and thermal conditions.

  • GPU Utilization Monitoring: Utilization, memory usage, temperature, fan speed, and power consumption.
  • Automated Power Limiting: Adjusts power limits dynamically based on temperature thresholds and total power caps, configurable per GPU or globally.
  • Cross-GPU Coordination: Total power budget applies across multiple GPUs in the same system.
  • PCIe Link Speed Management: Controls minimum and maximum PCIe link speeds with idle thresholds for power optimization.
  • Home Assistant Integration: Uses the built-in RESTful platform and template sensors.

Getting the Data

# Install the Go toolchain (Debian/Ubuntu package name).
sudo apt install golang-go
# Clone into "NVapi" explicitly: the repository is named "NVApi", so a plain
# clone would create ./NVApi and the case-sensitive `cd NVapi` would fail.
# This also matches the /home/ansible/NVapi path used by the systemd unit below.
git clone https://github.com/sammcj/NVApi.git NVapi
cd NVapi
# Start the API on port 9999; this command stays in the foreground.
go run main.go -port 9999 -rate 1
# From a second terminal, query the GPU metrics endpoint.
curl http://localhost:9999/gpu

Response for a single GPU:

[
  {
    "index": 0,
    "name": "NVIDIA GeForce RTX 4090",
    "gpu_utilisation": 0,
    "memory_utilisation": 0,
    "power_watts": 16,
    "power_limit_watts": 450,
    "memory_total_gb": 23.99,
    "memory_used_gb": 0.46,
    "memory_free_gb": 23.52,
    "memory_usage_percent": 2,
    "temperature": 38,
    "processes": [],
    "pcie_link_state": "not managed"
  }
]

Response for multiple GPUs:

[
  {
    "index": 0,
    "name": "NVIDIA GeForce RTX 3090",
    "gpu_utilisation": 0,
    "memory_utilisation": 0,
    "power_watts": 14,
    "power_limit_watts": 350,
    "memory_total_gb": 24,
    "memory_used_gb": 0.43,
    "memory_free_gb": 23.57,
    "memory_usage_percent": 2,
    "temperature": 36,
    "processes": [],
    "pcie_link_state": "not managed"
  },
  {
    "index": 1,
    "name": "NVIDIA RTX A4000",
    "gpu_utilisation": 0,
    "memory_utilisation": 0,
    "power_watts": 10,
    "power_limit_watts": 140,
    "memory_total_gb": 15.99,
    "memory_used_gb": 0.56,
    "memory_free_gb": 15.43,
    "memory_usage_percent": 3,
    "temperature": 41,
    "processes": [],
    "pcie_link_state": "not managed"
  }
]

Start at Boot

Create /etc/systemd/system/nvapi.service:

# systemd unit that launches NVApi with `go run` once the network target is up.
[Unit]
Description=Run NVapi
After=network.target

[Service]
Type=simple
# Run as the "ansible" user, from that user's checkout of the repository.
User=ansible
WorkingDirectory=/home/ansible/NVapi
Environment="GOPATH=/home/ansible/go"
ExecStart=/usr/bin/go run main.go -port 9999 -rate 1
# Bring the service back up whenever the process exits.
Restart=always
# Optional per-GPU temperature / power-limit settings, disabled by default.
# Uncomment the lines you need.
# NOTE(review): changing GPU power limits usually needs elevated privileges;
# confirm that User=ansible is sufficient before enabling these.
# Environment="GPU_TEMP_CHECK_INTERVAL=5"
# Environment="GPU_TOTAL_POWER_CAP=400"
# Environment="GPU_0_LOW_TEMP=40"
# Environment="GPU_0_MEDIUM_TEMP=70"
# Environment="GPU_0_LOW_TEMP_LIMIT=135"
# Environment="GPU_0_MEDIUM_TEMP_LIMIT=120"
# Environment="GPU_0_HIGH_TEMP_LIMIT=100"
# Environment="GPU_1_LOW_TEMP=45"
# Environment="GPU_1_MEDIUM_TEMP=75"
# Environment="GPU_1_LOW_TEMP_LIMIT=140"
# Environment="GPU_1_MEDIUM_TEMP_LIMIT=125"
# Environment="GPU_1_HIGH_TEMP_LIMIT=110"

[Install]
WantedBy=multi-user.target

Home Assistant

Add the following to Home Assistant's configuration.yaml, then restart Home Assistant completely (a full restart, not just a configuration reload).

For a single GPU, this works:

sensor:
  # REST sensor: polls the NVApi endpoint and keeps the GPU's metrics as
  # attributes of a single entity (sensor.mypc_gpu_information).
  - platform: rest
    name: MYPC GPU Information
    resource: http://mypc:9999
    method: GET
    headers:
      Content-Type: application/json
    scan_interval: 1  # seconds
    # The state is the "index" field of the first entry in the returned array.
    value_template: "{{ value_json[0].index }}"
    json_attributes:
      - name
      - gpu_utilisation
      - memory_utilisation
      - power_watts
      - power_limit_watts
      - memory_total_gb
      - memory_used_gb
      - memory_free_gb
      - memory_usage_percent
      - temperature

  # Template sensors: expose selected attributes of the REST sensor above as
  # their own entities, each named after the GPU's reported "name" attribute.
  - platform: template
    sensors:
      mypc_gpu_0_gpu:
        unit_of_measurement: "%"
        friendly_name: "MYPC {{ state_attr('sensor.mypc_gpu_information', 'name') }} GPU"
        value_template: "{{ state_attr('sensor.mypc_gpu_information', 'gpu_utilisation') }}"
      mypc_gpu_0_memory:
        unit_of_measurement: "%"
        friendly_name: "MYPC {{ state_attr('sensor.mypc_gpu_information', 'name') }} Memory"
        value_template: "{{ state_attr('sensor.mypc_gpu_information', 'memory_utilisation') }}"
      mypc_gpu_0_power:
        unit_of_measurement: "W"
        friendly_name: "MYPC {{ state_attr('sensor.mypc_gpu_information', 'name') }} Power"
        value_template: "{{ state_attr('sensor.mypc_gpu_information', 'power_watts') }}"
      mypc_gpu_0_power_limit:
        unit_of_measurement: "W"
        friendly_name: "MYPC {{ state_attr('sensor.mypc_gpu_information', 'name') }} Power Limit"
        value_template: "{{ state_attr('sensor.mypc_gpu_information', 'power_limit_watts') }}"
      mypc_gpu_0_temperature:
        unit_of_measurement: "°C"
        friendly_name: "MYPC {{ state_attr('sensor.mypc_gpu_information', 'name') }} Temperature"
        value_template: "{{ state_attr('sensor.mypc_gpu_information', 'temperature') }}"

For multiple GPUs:

# A single REST poll of the NVApi endpoint feeds one sensor per GPU; each
# sensor reads its attributes from its own element of the returned JSON array.
rest:
  scan_interval: 1
  resource: http://mypc:9999
  sensor:
    - name: "MYPC GPU0 Information"
      value_template: "{{ value_json[0].index }}"
      json_attributes_path: "$.0"
      json_attributes:
        - name
        - gpu_utilisation
        - memory_utilisation
        - power_watts
        - power_limit_watts
        - memory_total_gb
        - memory_used_gb
        - memory_free_gb
        - memory_usage_percent
        - temperature
    - name: "MYPC GPU1 Information"
      value_template: "{{ value_json[1].index }}"
      json_attributes_path: "$.1"
      json_attributes:
        - name
        - gpu_utilisation
        - memory_utilisation
        - power_watts
        - power_limit_watts
        - memory_total_gb
        - memory_used_gb
        - memory_free_gb
        - memory_usage_percent
        - temperature

# The template platforms are items of the top-level `sensor:` list; without
# this key the `- platform: template` entries are not valid YAML. If your
# configuration.yaml already has a `sensor:` key, append these items to it
# rather than declaring the key a second time.
sensor:
  # Per-metric entities for GPU 0, read from sensor.mypc_gpu0_information.
  - platform: template
    sensors:
      mypc_gpu_0_gpu:
        friendly_name: "MYPC GPU0 GPU"
        value_template: "{{ state_attr('sensor.mypc_gpu0_information', 'gpu_utilisation') }}"
        unit_of_measurement: "%"
      mypc_gpu_0_memory:
        friendly_name: "MYPC GPU0 Memory"
        value_template: "{{ state_attr('sensor.mypc_gpu0_information', 'memory_utilisation') }}"
        unit_of_measurement: "%"
      mypc_gpu_0_power:
        friendly_name: "MYPC GPU0 Power"
        value_template: "{{ state_attr('sensor.mypc_gpu0_information', 'power_watts') }}"
        unit_of_measurement: "W"
      mypc_gpu_0_power_limit:
        friendly_name: "MYPC GPU0 Power Limit"
        value_template: "{{ state_attr('sensor.mypc_gpu0_information', 'power_limit_watts') }}"
        unit_of_measurement: "W"
      mypc_gpu_0_temperature:
        friendly_name: "MYPC GPU0 Temperature"
        value_template: "{{ state_attr('sensor.mypc_gpu0_information', 'temperature') }}"
        # "°C" (not a bare "C") matches the unit used in the single-GPU example.
        unit_of_measurement: "°C"

  # Per-metric entities for GPU 1, read from sensor.mypc_gpu1_information.
  - platform: template
    sensors:
      mypc_gpu_1_gpu:
        friendly_name: "MYPC GPU1 GPU"
        value_template: "{{ state_attr('sensor.mypc_gpu1_information', 'gpu_utilisation') }}"
        unit_of_measurement: "%"
      mypc_gpu_1_memory:
        friendly_name: "MYPC GPU1 Memory"
        value_template: "{{ state_attr('sensor.mypc_gpu1_information', 'memory_utilisation') }}"
        unit_of_measurement: "%"
      mypc_gpu_1_power:
        friendly_name: "MYPC GPU1 Power"
        value_template: "{{ state_attr('sensor.mypc_gpu1_information', 'power_watts') }}"
        unit_of_measurement: "W"
      mypc_gpu_1_power_limit:
        friendly_name: "MYPC GPU1 Power Limit"
        value_template: "{{ state_attr('sensor.mypc_gpu1_information', 'power_limit_watts') }}"
        unit_of_measurement: "W"
      mypc_gpu_1_temperature:
        friendly_name: "MYPC GPU1 Temperature"
        value_template: "{{ state_attr('sensor.mypc_gpu1_information', 'temperature') }}"
        unit_of_measurement: "°C"

Basic entity card:

# Entities card listing the five GPU 0 template sensors; each row shows the
# time the sensor was last updated as its secondary line.
type: entities
entities:
  - entity: sensor.mypc_gpu_0_gpu
    secondary_info: last-updated
  - entity: sensor.mypc_gpu_0_memory
    secondary_info: last-updated
  - entity: sensor.mypc_gpu_0_power
    secondary_info: last-updated
  - entity: sensor.mypc_gpu_0_power_limit
    secondary_info: last-updated
  - entity: sensor.mypc_gpu_0_temperature
    secondary_info: last-updated

Ansible Role

---
# Tasks that fetch NVApi from source and run it as the nvapi systemd service.
- name: install go and git
  become: true
  package:
    name:
      - golang-go
      # The git module used in the next task needs the git command-line tool
      # on the target host.
      - git
    state: present

- name: git clone
  git:
    repo: "https://github.com/sammcj/NVApi.git"
    dest: "/home/ansible/NVapi"
    update: true
    # Discards any local modifications in the existing checkout.
    force: true

# The installed unit starts the API with: go run main.go -port 9999 -rate 1
- name: install systemd service
  become: true
  copy:
    src: nvapi.service
    dest: /etc/systemd/system/nvapi.service
    # Set ownership and permissions explicitly instead of relying on defaults.
    owner: root
    group: root
    mode: "0644"

# Note: state "restarted" restarts the service on every run of these tasks.
- name: Reload systemd daemons, enable, and restart nvapi
  become: true
  systemd:
    name: nvapi
    daemon_reload: true
    enabled: true
    state: restarted
#gpu
0