feat(maitred): Update maitred - hookup to the API (#198)

## Description
We are attempting to hook up maitred to the API.
Maitred's duties will be:
- [ ] Hook up to the API
- [ ] Wait for a signal (from the API) to start Steam
- [ ] On a stop signal, stop the gaming session, clean up Steam... and
maybe do the backup

## Summary by CodeRabbit

- **New Features**
- Introduced Docker-based deployment configurations for both the main
and relay applications.
- Added new API endpoints enabling real-time machine messaging and
enhanced IoT operations.
- Expanded database schema and actor types to support improved machine
tracking.

- **Improvements**
- Enhanced real-time communication and relay management with streamlined
room handling.
- Upgraded dependencies, logging, and error handling for greater
stability and performance.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: DatCaptainHorse <DatCaptainHorse@users.noreply.github.com>
Co-authored-by: Kristian Ollikainen <14197772+DatCaptainHorse@users.noreply.github.com>
This commit is contained in:
Wanjohi
2025-04-07 23:23:53 +03:00
committed by GitHub
parent 6990494b34
commit de80f3e6ab
84 changed files with 7357 additions and 1331 deletions

View File

@@ -0,0 +1,184 @@
package system
import (
"bytes"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
)
// PCI class codes, as they appear in the bracketed ID of the lspci "Class"
// field, that isGPUClass accepts as GPU devices.
const (
pciClassVGA = 0x0300 // VGA compatible controller
pciClass3D = 0x0302 // 3D controller
pciClassDisplay = 0x0380 // Display controller
pciClassCoProcessor = 0x0b40 // Co-processor (e.g., NVIDIA Tesla)
)
// infoPair couples the human-readable name of an lspci field with the numeric
// ID that followed it in square brackets (see parseInfoPair).
type infoPair struct {
Name string // text in front of the trailing "[id]"
ID int // trailing hexadecimal ID, parsed to an integer
}
// PCIInfo describes one PCI device as parsed by GetAllGPUInfo from a single
// record of `lspci -mmvvvnnkD` output. Each field is filled from the lspci key
// of the same name; fields whose key is absent keep their zero value.
type PCIInfo struct {
Slot string // "Slot" value: the PCI address of the device
Class infoPair // "Class": device class name and class code
Vendor infoPair // "Vendor": vendor name and vendor ID
Device infoPair // "Device": device name and device ID
SVendor infoPair // "SVendor": subsystem vendor name and ID
SDevice infoPair // "SDevice": subsystem device name and ID
Rev string // "Rev" value, kept as printed by lspci
ProgIf string // "ProgIf" value, kept as printed by lspci
Driver string // "Driver" value
Modules []string // one entry per "Module" line
IOMMUGroup string // "IOMMUGroup" value
}
// PCI vendor IDs that GetGPUUsage compares against PCIInfo.Vendor.ID to pick
// the vendor-specific monitoring routine.
const (
VendorIntel = 0x8086
VendorNVIDIA = 0x10de
VendorAMD = 0x1002
)
// GetAllGPUInfo runs `lspci -mmvvvnnkD` and returns one PCIInfo for every
// device whose class ID is accepted by isGPUClass.
//
// The output is treated as blank-line separated records of "Key: value"
// lines. An error is returned when lspci cannot be executed or when any
// Class/Vendor/Device/SVendor/SDevice value fails to parse; in that case no
// devices are returned at all.
func GetAllGPUInfo() ([]PCIInfo, error) {
	raw, err := exec.Command("lspci", "-mmvvvnnkD").Output()
	if err != nil {
		return nil, err
	}

	var gpus []PCIInfo
	for _, record := range bytes.Split(raw, []byte("\n\n")) {
		var dev PCIInfo
		for _, row := range bytes.Split(record, []byte("\n")) {
			rawKey, rawValue, found := bytes.Cut(row, []byte(":"))
			if !found {
				continue
			}
			value := strings.TrimSpace(string(rawValue))

			var parseErr error
			switch strings.TrimSpace(string(rawKey)) {
			case "Slot":
				dev.Slot = value
			case "Class":
				dev.Class, parseErr = parseInfoPair(value)
			case "Vendor":
				dev.Vendor, parseErr = parseInfoPair(value)
			case "Device":
				dev.Device, parseErr = parseInfoPair(value)
			case "SVendor":
				dev.SVendor, parseErr = parseInfoPair(value)
			case "SDevice":
				dev.SDevice, parseErr = parseInfoPair(value)
			case "Rev":
				dev.Rev = value
			case "ProgIf":
				dev.ProgIf = value
			case "Driver":
				dev.Driver = value
			case "Module":
				dev.Modules = append(dev.Modules, value)
			case "IOMMUGroup":
				dev.IOMMUGroup = value
			}
			if parseErr != nil {
				return nil, parseErr
			}
		}

		// Keep only records that describe a GPU-class device.
		if isGPUClass(dev.Class.ID) {
			gpus = append(gpus, dev)
		}
	}
	return gpus, nil
}
// parseInfoPair splits an lspci value of the form "SomeName [hexid]" into its
// name and numeric ID.
//
// The ID is whatever follows the LAST "[" in the value (a trailing "]" is
// dropped); everything in front of that bracket is the name, so brackets that
// are part of the name are preserved even when they contain the same text as
// the ID.
//
// Example: "DG2 [Arc A770] [56a0]" -> Name: "DG2 [Arc A770]", ID: 0x56a0
//
// An error is returned when the value contains no "[" or when the bracketed
// text is not a hexadecimal number accepted by parseHexID.
func parseInfoPair(pair string) (infoPair, error) {
	trimmed := strings.TrimSpace(pair)
	open := strings.LastIndex(trimmed, "[")
	if open < 0 {
		return infoPair{}, errors.New("invalid info pair")
	}

	id := strings.TrimSpace(strings.TrimSuffix(trimmed[open+1:], "]"))
	idHex, err := parseHexID(id)
	if err != nil {
		return infoPair{}, err
	}

	return infoPair{
		Name: strings.TrimSpace(trimmed[:open]),
		ID:   idHex,
	}, nil
}
// parseHexID converts a hexadecimal ID string into an int. A leading
// lower-case "0x" is optional. Text that is not valid base-16, or a value that
// does not fit in a signed 32-bit integer, yields the strconv error.
func parseHexID(id string) (int, error) {
	value, err := strconv.ParseInt(strings.TrimPrefix(id, "0x"), 16, 32)
	if err != nil {
		return 0, err
	}
	return int(value), nil
}
// isGPUClass reports whether class is one of the PCI class codes this package
// treats as a GPU: VGA controller, 3D controller, display controller or
// co-processor.
func isGPUClass(class int) bool {
	switch class {
	case pciClassVGA, pciClass3D, pciClassDisplay, pciClassCoProcessor:
		return true
	default:
		return false
	}
}
// GetCardDevices returns the /dev/dri/cardX and /dev/dri/renderDXXX device
// nodes that belong to this PCI device.
//
// The lower-cased Slot must be a full PCI address of the form DDDD:BB:DD.F.
// Any four-character PCI domain is accepted, not only 0000, so devices on
// multi-domain hosts can be resolved as well. The nodes are found by
// resolving the "pci-<address>-card" and "pci-<address>-render" symlinks in
// /dev/dri/by-path/.
//
// Either returned path may be empty when only one of the two links exists.
// An error is returned when the address is malformed, the by-path directory
// cannot be read, a matching symlink cannot be resolved, or neither link
// exists.
func (info PCIInfo) GetCardDevices() (cardPath, renderPath string, err error) {
	busID := strings.ToLower(info.Slot)
	// The length check comes first so the index expressions cannot go out of range.
	if len(busID) != 12 || busID[4] != ':' || busID[7] != ':' || busID[10] != '.' {
		return "", "", fmt.Errorf("invalid PCI Bus ID format: %s (expected DDDD:BB:DD.F)", busID)
	}

	const byPathDir = "/dev/dri/by-path/"
	entries, err := os.ReadDir(byPathDir)
	if err != nil {
		return "", "", fmt.Errorf("failed to read %s: %w", byPathDir, err)
	}

	cardPrefix := "pci-" + busID + "-card"
	renderPrefix := "pci-" + busID + "-render"
	for _, entry := range entries {
		name := entry.Name()
		switch {
		case strings.HasPrefix(name, cardPrefix):
			cardPath, err = filepath.EvalSymlinks(filepath.Join(byPathDir, name))
			if err != nil {
				return "", "", fmt.Errorf("failed to resolve card symlink %s: %w", name, err)
			}
		case strings.HasPrefix(name, renderPrefix):
			renderPath, err = filepath.EvalSymlinks(filepath.Join(byPathDir, name))
			if err != nil {
				return "", "", fmt.Errorf("failed to resolve render symlink %s: %w", name, err)
			}
		}
	}

	if cardPath == "" && renderPath == "" {
		return "", "", fmt.Errorf("no DRM devices found for PCI Bus ID: %s", busID)
	}
	return cardPath, renderPath, nil
}

View File

@@ -0,0 +1,290 @@
package system
import (
"bufio"
"fmt"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
"unsafe"
)
// FDInfo holds the values parsed from one /proc/<pid>/fdinfo/<fd> file of a
// DRM client (see collectFDInfo). Keys that are missing from the file leave
// the corresponding field at zero.
type FDInfo struct {
ClientID string // "drm-client-id"; used to de-duplicate clients
EngineTime uint64 // i915: "drm-engine-render" in ns
Cycles uint64 // Xe: "drm-cycles-rcs"
TotalCycles uint64 // Xe: "drm-total-cycles-rcs"
MemoryVRAM uint64 // i915: "drm-memory-vram", Xe: "drm-total-vram0" in bytes
}
// findCardX returns the name of the /sys/class/drm entry (for example
// "card0") whose "device" symlink target contains pciSlot. Entries that do
// not start with "card" or whose link cannot be read are skipped. An error is
// returned when the directory cannot be listed or nothing matches.
func findCardX(pciSlot string) (string, error) {
	const drmClassDir = "/sys/class/drm"

	entries, err := os.ReadDir(drmClassDir)
	if err != nil {
		return "", fmt.Errorf("failed to read /sys/class/drm: %v", err)
	}

	for _, entry := range entries {
		name := entry.Name()
		if !strings.HasPrefix(name, "card") {
			continue
		}
		target, linkErr := os.Readlink(filepath.Join(drmClassDir, name, "device"))
		if linkErr != nil || !strings.Contains(target, pciSlot) {
			continue
		}
		return name, nil
	}
	return "", fmt.Errorf("no cardX found for PCI slot %s", pciSlot)
}
// getDriver returns the kernel driver bound to the given DRM card, taken as
// the last path element of the /sys/class/drm/<cardX>/device/driver symlink
// target.
func getDriver(cardX string) (string, error) {
	link, err := os.Readlink(filepath.Join("/sys/class/drm", cardX, "device", "driver"))
	if err == nil {
		return filepath.Base(link), nil
	}
	return "", fmt.Errorf("failed to read driver link for %s: %v", cardX, err)
}
// collectFDInfo walks /proc and returns one FDInfo per distinct DRM client
// that has /dev/dri/<cardX> open.
//
// For every numeric /proc/<pid> directory, each entry of /proc/<pid>/fd that
// links to the card device has its /proc/<pid>/fdinfo/<fd> file parsed.
// Clients without an ID, with ID "0", or already seen through another file
// descriptor are skipped. Processes or descriptors that cannot be inspected
// (exited, permission denied) are silently ignored because this is a
// best-effort scan. An error is returned only when /proc itself cannot be
// listed.
func collectFDInfo(cardX string) ([]FDInfo, error) {
	procDirs, err := os.ReadDir("/proc")
	if err != nil {
		return nil, fmt.Errorf("failed to read /proc: %v", err)
	}

	devicePath := "/dev/dri/" + cardX
	var fdInfos []FDInfo
	seen := make(map[string]struct{})

	for _, procDir := range procDirs {
		if !procDir.IsDir() {
			continue
		}
		pid := procDir.Name()
		if _, err := strconv.Atoi(pid); err != nil {
			continue
		}

		fdDir := filepath.Join("/proc", pid, "fd")
		fdEntries, err := os.ReadDir(fdDir)
		if err != nil {
			continue
		}

		for _, fdEntry := range fdEntries {
			target, err := os.Readlink(filepath.Join(fdDir, fdEntry.Name()))
			if err != nil || target != devicePath {
				continue
			}

			info, ok := readFDInfoFile(filepath.Join("/proc", pid, "fdinfo", fdEntry.Name()))
			if !ok {
				continue
			}
			if _, dup := seen[info.ClientID]; dup {
				continue
			}
			seen[info.ClientID] = struct{}{}
			fdInfos = append(fdInfos, info)
		}
	}
	return fdInfos, nil
}

// readFDInfoFile parses a single fdinfo file. It reports false when the file
// cannot be opened or does not carry a usable drm-client-id. The file is
// always closed before returning.
func readFDInfoFile(path string) (FDInfo, bool) {
	file, err := os.Open(path)
	if err != nil {
		return FDInfo{}, false
	}
	defer file.Close()

	var info FDInfo
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		key, value, found := strings.Cut(scanner.Text(), ":")
		if !found {
			continue
		}
		value = strings.TrimSpace(value)
		switch strings.TrimSpace(key) {
		case "drm-client-id":
			info.ClientID = value
		case "drm-engine-render":
			info.EngineTime = parseFDInfoQuantity(value)
		case "drm-cycles-rcs":
			info.Cycles = parseFDInfoQuantity(value)
		case "drm-total-cycles-rcs":
			info.TotalCycles = parseFDInfoQuantity(value)
		case "drm-memory-vram", "drm-total-vram0": // i915 and Xe keys
			info.MemoryVRAM = parseFDInfoQuantity(value)
		}
	}
	// A read error mid-file is tolerated: whatever was parsed so far is used.

	if info.ClientID == "" || info.ClientID == "0" {
		return FDInfo{}, false
	}
	return info, true
}

// parseFDInfoQuantity parses an fdinfo value of the form "<uint>[ <unit>]".
// "kB"/"KiB" and "MiB" scale the number to bytes; any other unit (such as
// "ns") or no unit returns the number unchanged. Unparsable input yields 0.
func parseFDInfoQuantity(value string) uint64 {
	number, unit, _ := strings.Cut(value, " ")
	n, err := strconv.ParseUint(number, 10, 64)
	if err != nil {
		return 0
	}
	switch strings.TrimSpace(unit) {
	case "kB", "KiB":
		n *= 1024
	case "MiB":
		n *= 1024 * 1024
	}
	return n
}
// drmIoctl issues an ioctl(2) on fd with the given request code and argument
// pointer, turning a non-zero errno into an error.
func drmIoctl(fd int, request uintptr, data unsafe.Pointer) error {
	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), request, uintptr(data)); errno != 0 {
		return fmt.Errorf("ioctl failed: %v", errno)
	}
	return nil
}
// monitorIntelGPU samples utilisation and VRAM usage of an Intel GPU driven
// by i915 or xe.
//
// It resolves the DRM card for device.Slot, queries memory regions through a
// driver-specific ioctl, and takes two fdinfo samples about one second apart
// (the call therefore blocks for roughly a second). Utilisation is derived
// from the per-client counter deltas between the two samples:
//   - i915: summed render-engine busy time divided by the measured wall time
//     between the samples;
//   - xe: summed client cycles divided by the total-cycles delta of the first
//     client present in both samples.
//
// Counters that went backwards between samples contribute nothing instead of
// wrapping around. The result is capped at 100 percent.
//
// On any setup failure (card or driver lookup, unsupported driver, opening
// the device, collecting fdinfo) a warning is logged and a zero GPUUsage is
// returned. A failing memory-region ioctl is tolerated: used VRAM then falls
// back to the sum reported in fdinfo and total VRAM stays 0.
func monitorIntelGPU(device PCIInfo) GPUUsage {
	// Map PCI slot to cardX
	cardX, err := findCardX(device.Slot)
	if err != nil {
		slog.Warn("failed to find cardX for Intel GPU", "slot", device.Slot, "error", err)
		return GPUUsage{}
	}

	// Determine driver
	driver, err := getDriver(cardX)
	if err != nil {
		slog.Warn("failed to get driver", "card", cardX, "error", err)
		return GPUUsage{}
	}
	if driver != "i915" && driver != "xe" {
		slog.Warn("unsupported Intel driver", "driver", driver, "card", cardX)
		return GPUUsage{}
	}
	// PCIInfo also has the driver, let's warn if they don't match
	if device.Driver != driver {
		slog.Warn("driver mismatch", "card", cardX, "lspci driver", device.Driver, "sysfs driver", driver)
	}

	// Open DRM device
	cardPath := "/dev/dri/" + cardX
	fd, err := syscall.Open(cardPath, syscall.O_RDWR, 0)
	if err != nil {
		slog.Error("failed to open DRM device", "path", cardPath, "error", err)
		return GPUUsage{}
	}
	defer func() {
		_ = syscall.Close(fd)
	}()

	// Get total and used VRAM via ioctl
	var totalVRAM, usedVRAMFromIOCTL uint64
	if driver == "i915" {
		totalVRAM, usedVRAMFromIOCTL, err = getMemoryRegionsI915(fd)
	} else { // xe
		totalVRAM, usedVRAMFromIOCTL, err = queryMemoryRegionsXE(fd)
	}
	if err != nil {
		// Best effort: continue without ioctl data and rely on fdinfo below.
		slog.Debug("failed to get memory regions", "card", cardX, "error", err)
		totalVRAM, usedVRAMFromIOCTL = 0, 0
	}

	// Collect two samples; the timestamps give the real sampling interval.
	firstAt := time.Now()
	firstFDInfos, err := collectFDInfo(cardX)
	if err != nil {
		slog.Warn("failed to collect first FDInfo", "card", cardX, "error", err)
		return GPUUsage{}
	}
	time.Sleep(1 * time.Second)
	secondAt := time.Now()
	secondFDInfos, err := collectFDInfo(cardX)
	if err != nil {
		slog.Warn("failed to collect second FDInfo", "card", cardX, "error", err)
		return GPUUsage{}
	}
	elapsed := secondAt.Sub(firstAt)

	// Index the first sample by client so each later client is matched once.
	previous := make(map[string]FDInfo, len(firstFDInfos))
	for _, first := range firstFDInfos {
		previous[first.ClientID] = first
	}

	var busyNS, busyCycles, totalCycles uint64
	for _, second := range secondFDInfos {
		first, ok := previous[second.ClientID]
		if !ok {
			continue // client appeared after the first sample
		}
		if second.EngineTime > first.EngineTime {
			busyNS += second.EngineTime - first.EngineTime
		}
		if second.Cycles > first.Cycles {
			busyCycles += second.Cycles - first.Cycles
		}
		// Take the total-cycles delta from the first client that has one.
		if totalCycles == 0 && second.TotalCycles > first.TotalCycles {
			totalCycles = second.TotalCycles - first.TotalCycles
		}
	}

	// Calculate usage percentage
	var usagePercent float64
	if driver == "i915" {
		if busyNS > 0 && elapsed > 0 {
			usagePercent = float64(busyNS) / float64(elapsed.Nanoseconds()) * 100
		}
	} else if totalCycles > 0 { // xe
		usagePercent = float64(busyCycles) / float64(totalCycles) * 100
	}
	if usagePercent > 100 {
		usagePercent = 100
	}

	// Sum per-process VRAM usage as fallback
	var usedVRAM uint64
	for _, fdInfo := range secondFDInfos {
		usedVRAM += fdInfo.MemoryVRAM
	}
	// Prefer ioctl used VRAM if available and non-zero
	if usedVRAMFromIOCTL != 0 {
		usedVRAM = usedVRAMFromIOCTL
	}

	// Compute VRAM metrics
	var freeVRAM uint64
	var usedPercent float64
	if totalVRAM > 0 {
		if usedVRAM > totalVRAM {
			usedVRAM = totalVRAM
		}
		freeVRAM = totalVRAM - usedVRAM
		usedPercent = float64(usedVRAM) / float64(totalVRAM) * 100
	}

	return GPUUsage{
		Info:         device,
		UsagePercent: usagePercent,
		VRAM: VRAMUsage{
			Total:       totalVRAM,
			Used:        usedVRAM,
			Free:        freeVRAM,
			UsedPercent: usedPercent,
		},
	}
}

View File

@@ -0,0 +1,86 @@
package system
import (
"fmt"
"unsafe"
)
// Constants for i915 (values follow the kernel's i915 uAPI header).
const (
	DRM_COMMAND_BASE = 0x40
	DRM_I915_QUERY   = 0x39
	// DRM_IOCTL_I915_QUERY is _IOWR('d', DRM_COMMAND_BASE+DRM_I915_QUERY, 16):
	// direction read|write (3<<30) | size 16<<16 | 'd'<<8 | 0x79. The write
	// bit is required so the kernel copies the query struct in from user space.
	DRM_IOCTL_I915_QUERY          = 0xC0106479
	DRM_I915_QUERY_MEMORY_REGIONS = 4
	I915_MEMORY_CLASS_DEVICE      = 1
)
// drmI915QueryItem mirrors struct drm_i915_query_item: one query request
// passed to the i915 query ioctl. getMemoryRegionsI915 first submits it with
// Length 0 and reads back the buffer size the query needs from Length.
// NOTE(review): the kernel header declares query_id and data_ptr as __u64;
// uintptr only has that width on 64-bit platforms — confirm 32-bit builds are
// not a target.
type drmI915QueryItem struct {
QueryID uintptr // which query to run, e.g. DRM_I915_QUERY_MEMORY_REGIONS
Length int32 // in: size of the buffer at DataPtr (0 to ask for the size); out: required size, values <= 0 are treated as failure
Flags uint32
DataPtr uintptr // user-space address of the result buffer
}
// drmI915Query mirrors struct drm_i915_query: the argument of
// DRM_IOCTL_I915_QUERY, pointing at an array of NumItems query items.
type drmI915Query struct {
NumItems uint32 // number of drmI915QueryItem entries at ItemsPtr
Flags uint32
ItemsPtr uintptr // user-space address of the first query item
}
// drmI915MemoryRegionInfo mirrors struct drm_i915_memory_region_info: one
// entry of the memory-region query result. Its size is 88 bytes
// (4+4+8+8+64), which getMemoryRegionsI915 relies on when walking the buffer.
type drmI915MemoryRegionInfo struct {
Region struct {
MemoryClass uint16 // compared against I915_MEMORY_CLASS_DEVICE to select VRAM regions
MemoryInstance uint16
}
Rsvd0 uint32
ProbedSize uint64 // summed into total VRAM
UnallocatedSize uint64 // ProbedSize minus this is counted as used VRAM
Rsvd1 [8]uint64
}
// getMemoryRegionsI915 queries the i915 driver behind fd for its memory
// regions and returns the summed size and used amount of all device-local
// (VRAM) regions, in bytes.
//
// The query is issued twice: first with Length 0 so the kernel reports the
// required buffer size, then with a buffer of that size. The result buffer is
// validated before it is walked: it must hold the 16-byte header and every
// region entry announced in it. A region whose unallocated size exceeds its
// probed size contributes nothing to usedVRAM rather than wrapping around.
//
// NOTE(review): item and the result buffer are handed to the kernel as
// uintptr values; confirm against the unsafe.Pointer rules that this is
// acceptable for the supported Go versions.
func getMemoryRegionsI915(fd int) (totalVRAM, usedVRAM uint64, err error) {
	const (
		headerSize uint64 = 16 // num_regions (4) + rsvd[3] (12)
		regionSize uint64 = 88 // sizeof(struct drm_i915_memory_region_info): 4+4+8+8+64
	)

	// Step 1: Get the required buffer size
	item := drmI915QueryItem{
		QueryID: DRM_I915_QUERY_MEMORY_REGIONS,
		Length:  0,
	}
	query := drmI915Query{
		NumItems: 1,
		ItemsPtr: uintptr(unsafe.Pointer(&item)),
	}
	if err = drmIoctl(fd, DRM_IOCTL_I915_QUERY, unsafe.Pointer(&query)); err != nil {
		return 0, 0, fmt.Errorf("initial i915 query failed: %w", err)
	}
	if item.Length <= 0 {
		return 0, 0, fmt.Errorf("i915 query returned invalid length: %d", item.Length)
	}

	// Step 2: Allocate buffer and perform the query
	data := make([]byte, item.Length)
	if uint64(len(data)) < headerSize {
		return 0, 0, fmt.Errorf("i915 query buffer too small for header: %d bytes", len(data))
	}
	item.DataPtr = uintptr(unsafe.Pointer(&data[0]))
	if err = drmIoctl(fd, DRM_IOCTL_I915_QUERY, unsafe.Pointer(&query)); err != nil {
		return 0, 0, fmt.Errorf("second i915 query failed: %w", err)
	}

	// Step 3: Parse the memory regions
	numRegions := uint64(*(*uint32)(unsafe.Pointer(&data[0])))
	for i := uint64(0); i < numRegions; i++ {
		offset := headerSize + i*regionSize // 64-bit arithmetic: cannot overflow for a 32-bit count
		if offset+regionSize > uint64(len(data)) {
			return 0, 0, fmt.Errorf("data buffer too small for i915 region %d", i)
		}
		mr := (*drmI915MemoryRegionInfo)(unsafe.Pointer(&data[offset]))
		if mr.Region.MemoryClass != I915_MEMORY_CLASS_DEVICE {
			continue
		}
		totalVRAM += mr.ProbedSize
		if mr.UnallocatedSize <= mr.ProbedSize {
			usedVRAM += mr.ProbedSize - mr.UnallocatedSize
		}
	}
	return totalVRAM, usedVRAM, nil
}

View File

@@ -0,0 +1,84 @@
package system
import (
"fmt"
"unsafe"
)
// Constants from xe_drm.h
const (
DRM_XE_DEVICE_QUERY_MEM_REGIONS = 1
DRM_XE_MEM_REGION_CLASS_VRAM = 1
DRM_XE_DEVICE_QUERY = 0x00
DRM_IOCTL_XE_DEVICE_QUERY uintptr = 0xC0286440 // read|write ioctl: 3<<30 | 40 (size of drmXEDeviceQuery)<<16 | 'd'<<8 | 0x40 (DRM_COMMAND_BASE + DRM_XE_DEVICE_QUERY)
)
// drmXEDeviceQuery mirrors struct drm_xe_device_query, the argument of
// DRM_IOCTL_XE_DEVICE_QUERY. queryMemoryRegionsXE submits it once with Size 0
// to learn the required buffer size and again with Data pointing at a buffer
// of that size.
type drmXEDeviceQuery struct {
Extensions uint64
Query uint32 // which query to run, e.g. DRM_XE_DEVICE_QUERY_MEM_REGIONS
Size uint32 // in: size of the buffer at Data (0 to ask for the size); out: required size
Data uint64 // user-space address of the result buffer
Reserved [2]uint64
}
// drmXEQueryMemRegions mirrors the fixed header of struct
// drm_xe_query_mem_regions. The drmXEMemRegion entries follow it directly in
// the result buffer.
type drmXEQueryMemRegions struct {
NumMemRegions uint32 // number of drmXEMemRegion entries after the header
Pad uint32
// mem_regions[] follows
}
// drmXEMemRegion mirrors struct drm_xe_mem_region: one entry of the xe
// memory-region query result.
type drmXEMemRegion struct {
MemClass uint16 // compared against DRM_XE_MEM_REGION_CLASS_VRAM to select VRAM regions
Instance uint16
MinPageSize uint32
TotalSize uint64 // summed into total VRAM
Used uint64 // summed into used VRAM
CPUVisibleSize uint64
CPUVisibleUsed uint64
Reserved [6]uint64
}
// queryMemoryRegionsXE asks the xe driver behind fd for its memory regions
// and returns the summed total size and used amount of every VRAM-class
// region.
//
// The device query is issued twice: once with Size 0 so the driver reports
// how large the result is, then with a buffer of that size. An error is
// returned when either ioctl fails, the reported size is zero, or the buffer
// is too short for a region announced in its header.
func queryMemoryRegionsXE(fd int) (totalVRAM, usedVRAM uint64, err error) {
	// Pass 1: size probe.
	query := drmXEDeviceQuery{Query: DRM_XE_DEVICE_QUERY_MEM_REGIONS}
	if err = drmIoctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, unsafe.Pointer(&query)); err != nil {
		return 0, 0, fmt.Errorf("initial xe query failed: %v", err)
	}
	if query.Size == 0 {
		return 0, 0, fmt.Errorf("xe query returned zero size")
	}

	// Pass 2: fetch the region table into a buffer of the reported size.
	buf := make([]byte, query.Size)
	query.Data = uint64(uintptr(unsafe.Pointer(&buf[0])))
	query.Size = uint32(len(buf))
	if err = drmIoctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, unsafe.Pointer(&query)); err != nil {
		return 0, 0, fmt.Errorf("second xe query failed: %v", err)
	}

	// Walk the entries that follow the header, one stride at a time.
	count := (*drmXEQueryMemRegions)(unsafe.Pointer(&buf[0])).NumMemRegions
	stride := unsafe.Sizeof(drmXEMemRegion{})
	end := uintptr(len(buf))
	offset := unsafe.Sizeof(drmXEQueryMemRegions{})
	for i := uint32(0); i < count; i++ {
		if offset+stride > end {
			return 0, 0, fmt.Errorf("data buffer too small for xe region %d", i)
		}
		region := (*drmXEMemRegion)(unsafe.Pointer(&buf[offset]))
		if region.MemClass == DRM_XE_MEM_REGION_CLASS_VRAM {
			totalVRAM += region.TotalSize
			usedVRAM += region.Used
		}
		offset += stride
	}
	return totalVRAM, usedVRAM, nil
}

View File

@@ -0,0 +1,57 @@
package system
import (
"log/slog"
"os/exec"
"strconv"
"strings"
)
// monitorNVIDIAGPU monitors an NVIDIA GPU using nvidia-smi.
//
// It runs one nvidia-smi CSV query covering all GPUs and picks the row whose
// PCI bus ID matches device.Slot (see nvidiaBusIDMatches). Memory figures are
// reported by nvidia-smi in MiB and converted to bytes. When nvidia-smi cannot
// be run or no row matches, a warning is logged and a zero GPUUsage is
// returned.
func monitorNVIDIAGPU(device PCIInfo) GPUUsage {
	// Query nvidia-smi for GPU metrics
	cmd := exec.Command("nvidia-smi", "--query-gpu=pci.bus_id,utilization.gpu,memory.total,memory.used,memory.free", "--format=csv,noheader,nounits")
	output, err := cmd.Output()
	if err != nil {
		slog.Warn("failed to run nvidia-smi", "error", err)
		return GPUUsage{}
	}

	// Parse output and find matching GPU
	for _, line := range strings.Split(strings.TrimSpace(string(output)), "\n") {
		fields := strings.Split(line, ", ")
		if len(fields) != 5 || !nvidiaBusIDMatches(fields[0], device.Slot) {
			continue
		}

		// Fields nvidia-smi cannot report are not numeric; they are
		// deliberately read as 0 instead of failing the whole sample.
		usagePercent, _ := strconv.ParseFloat(fields[1], 64)
		totalMiB, _ := strconv.ParseUint(fields[2], 10, 64)
		usedMiB, _ := strconv.ParseUint(fields[3], 10, 64)
		freeMiB, _ := strconv.ParseUint(fields[4], 10, 64)

		// Convert MiB to bytes
		total := totalMiB * 1024 * 1024
		used := usedMiB * 1024 * 1024
		free := freeMiB * 1024 * 1024

		usedPercent := float64(0)
		if total > 0 {
			usedPercent = float64(used) / float64(total) * 100
		}

		return GPUUsage{
			Info:         device,
			UsagePercent: usagePercent,
			VRAM: VRAMUsage{
				Total:       total,
				Used:        used,
				Free:        free,
				UsedPercent: usedPercent,
			},
		}
	}

	slog.Warn("No NVIDIA GPU found matching PCI slot", "slot", device.Slot)
	return GPUUsage{}
}

// nvidiaBusIDMatches reports whether an nvidia-smi bus ID (for example
// "00000000:0A:00.0") and an lspci slot (for example "0000:0a:00.0") denote
// the same device. The comparison ignores letter case and surrounding
// whitespace, accepts either string containing the other, and never matches
// an empty value.
func nvidiaBusIDMatches(busID, slot string) bool {
	busID = strings.ToLower(strings.TrimSpace(busID))
	slot = strings.ToLower(strings.TrimSpace(slot))
	if busID == "" || slot == "" {
		return false
	}
	return strings.Contains(busID, slot) || strings.Contains(slot, busID)
}

View File

@@ -0,0 +1,24 @@
package system
import (
"os"
"strings"
)
// Locations of the machine ID, in the order GetID tries them.
const (
	dbusPath    = "/var/lib/dbus/machine-id"
	dbusPathEtc = "/etc/machine-id"
)

// GetID returns the machine ID specified at `/var/lib/dbus/machine-id` or
// `/etc/machine-id`, with surrounding spaces and newlines removed. The second
// file is only consulted when the first cannot be read. If neither can be
// read, an empty string and the error from the last attempt are returned.
func GetID() (string, error) {
	var lastErr error
	for _, candidate := range [...]string{dbusPath, dbusPathEtc} {
		raw, err := os.ReadFile(candidate)
		if err == nil {
			return strings.Trim(string(raw), " \n"), nil
		}
		lastErr = err
	}
	return "", lastErr
}

View File

@@ -0,0 +1,405 @@
package system
import (
"bufio"
"bytes"
"context"
"fmt"
"log/slog"
"os"
"os/exec"
"strconv"
"strings"
"sync"
"time"
)
// CPUInfo contains CPU model information. GetCPUUsage fills it from the
// "vendor_id" and "model name" lines of /proc/cpuinfo.
type CPUInfo struct {
Vendor string `json:"vendor"` // CPU vendor (e.g., "AMD", "Intel")
Model string `json:"model"` // CPU model name
}
// CPUUsage contains CPU usage metrics, computed by GetCPUUsage from two
// /proc/stat samples.
type CPUUsage struct {
Info CPUInfo `json:"info"` // CPU vendor and model information
Total float64 `json:"total"` // Total CPU usage in percentage (0-100)
PerCore []float64 `json:"per_core"` // CPU usage per core in percentage (0-100)
}
// MemoryUsage contains memory usage metrics, derived by GetMemoryUsage from
// the MemTotal, MemFree and MemAvailable lines of /proc/meminfo.
type MemoryUsage struct {
Total uint64 `json:"total"` // Total memory in bytes
Used uint64 `json:"used"` // Used memory in bytes (Total minus Available)
Available uint64 `json:"available"` // Available memory in bytes
Free uint64 `json:"free"` // Free memory in bytes
UsedPercent float64 `json:"used_percent"` // Used memory in percentage (0-100)
}
// FilesystemUsage contains usage metrics for a filesystem path, as reported
// by the `df` command (see GetFilesystemUsage).
type FilesystemUsage struct {
Path string `json:"path"` // Filesystem path
Total uint64 `json:"total"` // Total disk space in bytes
Used uint64 `json:"used"` // Used disk space in bytes
Free uint64 `json:"free"` // Free disk space in bytes
UsedPercent float64 `json:"used_percent"` // Used disk space in percentage (0-100)
}
// GPUUsage contains GPU usage metrics for a single GPU. PCIInfo declares no
// JSON tags, so the object under "pci_info" is encoded with Go field names.
type GPUUsage struct {
Info PCIInfo `json:"pci_info"` // GPU PCI information
UsagePercent float64 `json:"usage_percent"` // GPU usage in percentage (0-100)
VRAM VRAMUsage `json:"vram"` // GPU memory usage metrics
}
// VRAMUsage contains GPU memory usage metrics
type VRAMUsage struct {
Total uint64 `json:"total"` // Total VRAM in bytes
Used uint64 `json:"used"` // Used VRAM in bytes
Free uint64 `json:"free"` // Free VRAM in bytes
UsedPercent float64 `json:"used_percent"` // Used VRAM in percentage (0-100)
}
// ResourceUsage contains resource usage metrics: one snapshot of everything
// updateUsage collects, as returned by GetSystemUsage.
type ResourceUsage struct {
CPU CPUUsage `json:"cpu"` // CPU usage metrics
Memory MemoryUsage `json:"memory"` // Memory usage metrics
Disk FilesystemUsage `json:"disk"` // Disk usage metrics (root filesystem "/")
GPUs []GPUUsage `json:"gpus"` // Per-GPU usage metrics
}
// Most recent snapshot published by updateUsage, and the lock guarding it.
var (
lastUsage ResourceUsage // replaced as a whole by updateUsage; read via GetSystemUsage
lastUsageMutex sync.RWMutex // write-locked by updateUsage, read-locked by GetSystemUsage
)
// GetSystemUsage returns the last known system resource usage metrics, i.e.
// the snapshot most recently stored by updateUsage. The value is copied while
// holding the read lock. Until a first sample has been stored it is the zero
// ResourceUsage.
func GetSystemUsage() ResourceUsage {
	lastUsageMutex.RLock()
	snapshot := lastUsage
	lastUsageMutex.RUnlock()
	return snapshot
}
// StartMonitoring begins periodic system usage monitoring with the given
// interval. It returns immediately; sampling happens on a background
// goroutine that takes one sample right away, then one per tick, and exits
// when ctx is done.
func StartMonitoring(ctx context.Context, interval time.Duration) {
	slog.Info("Starting system monitoring")
	go runUsageSampler(ctx, interval)
}

// runUsageSampler is the body of the monitoring goroutine: an immediate
// sample followed by one sample per tick until ctx is cancelled.
func runUsageSampler(ctx context.Context, interval time.Duration) {
	updateUsage()

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	cancelled := ctx.Done()
	for {
		select {
		case <-cancelled:
			slog.Info("Stopping system monitoring")
			return
		case <-ticker.C:
			updateUsage()
		}
	}
}
// updateUsage collects and updates the lastUsage variable
func updateUsage() {
// Collect CPU usage
cpu := GetCPUUsage()
// Collect memory usage
memory := GetMemoryUsage()
// Collect root filesystem usage
rootfs, err := GetFilesystemUsage("/")
if err != nil {
slog.Warn("Failed to get root filesystem usage", "error", err)
}
// Collect GPU usage
gpus := GetGPUUsage()
// Update shared variable safely
lastUsageMutex.Lock()
lastUsage = ResourceUsage{
CPU: cpu,
Memory: memory,
Disk: rootfs,
GPUs: gpus,
}
lastUsageMutex.Unlock()
}
// PrettyString returns resource usage metrics in a human-readable format
// string: a CPU section (vendor, model, total and per-core usage), a memory
// section, a filesystem section and one section per GPU.
//
// For every GPU the card and render device paths are looked up through
// PCIInfo.GetCardDevices; a failed lookup is logged as a warning and the
// paths are printed empty.
func (r ResourceUsage) PrettyString() string {
	var b strings.Builder

	b.WriteString("Resource Usage:\n")
	b.WriteString(" CPU:\n")
	fmt.Fprintf(&b, " Vendor: %s\n", r.CPU.Info.Vendor)
	fmt.Fprintf(&b, " Model: %s\n", r.CPU.Info.Model)
	fmt.Fprintf(&b, " Total Usage: %.2f%%\n", r.CPU.Total)
	b.WriteString(" Per-Core Usage:\n")
	b.WriteString(" [")
	for i, coreUsage := range r.CPU.PerCore {
		if i > 0 {
			b.WriteString(", ")
		}
		fmt.Fprintf(&b, "%.2f%%", coreUsage)
	}
	b.WriteString("]\n")

	b.WriteString(" Memory:\n")
	fmt.Fprintf(&b, " Total: %d bytes\n", r.Memory.Total)
	fmt.Fprintf(&b, " Used: %d bytes\n", r.Memory.Used)
	fmt.Fprintf(&b, " Available: %d bytes\n", r.Memory.Available)
	fmt.Fprintf(&b, " Free: %d bytes\n", r.Memory.Free)
	fmt.Fprintf(&b, " Used Percent: %.2f%%\n", r.Memory.UsedPercent)

	b.WriteString(" Filesystem:\n")
	fmt.Fprintf(&b, " Path: %s\n", r.Disk.Path)
	fmt.Fprintf(&b, " Total: %d bytes\n", r.Disk.Total)
	fmt.Fprintf(&b, " Used: %d bytes\n", r.Disk.Used)
	fmt.Fprintf(&b, " Free: %d bytes\n", r.Disk.Free)
	fmt.Fprintf(&b, " Used Percent: %.2f%%\n", r.Disk.UsedPercent)

	b.WriteString(" GPUs:\n")
	for i, gpu := range r.GPUs {
		cardDev, renderDev, err := gpu.Info.GetCardDevices()
		if err != nil {
			slog.Warn("Failed to get card and render devices", "error", err)
		}
		fmt.Fprintf(&b, " GPU %d:\n", i)
		fmt.Fprintf(&b, " Vendor: %s\n", gpu.Info.Vendor.Name)
		fmt.Fprintf(&b, " Model: %s\n", gpu.Info.Device.Name)
		fmt.Fprintf(&b, " Driver: %s\n", gpu.Info.Driver)
		fmt.Fprintf(&b, " Card Device: %s\n", cardDev)
		fmt.Fprintf(&b, " Render Device: %s\n", renderDev)
		fmt.Fprintf(&b, " Usage Percent: %.2f%%\n", gpu.UsagePercent)
		b.WriteString(" VRAM:\n")
		fmt.Fprintf(&b, " Total: %d bytes\n", gpu.VRAM.Total)
		fmt.Fprintf(&b, " Used: %d bytes\n", gpu.VRAM.Used)
		fmt.Fprintf(&b, " Free: %d bytes\n", gpu.VRAM.Free)
		fmt.Fprintf(&b, " Used Percent: %.2f%%\n", gpu.VRAM.UsedPercent)
	}
	return b.String()
}
// GetCPUUsage gathers CPU usage.
//
// It reads /proc/stat twice, one second apart (the call blocks for that
// second), and reports the share of non-idle time between the two samples
// for the aggregate "cpu" line and for each following per-core "cpuN" line.
// Vendor and model are read from /proc/cpuinfo.
//
// Failures are logged and degrade the result instead of aborting: an
// unreadable /proc/stat yields zero usage, a core that is missing from the
// first sample is reported as 0, and an unreadable /proc/cpuinfo leaves Info
// empty while the measured usage is still returned.
func GetCPUUsage() CPUUsage {
	prevAll, prevCores := readCPUStat()
	time.Sleep(1 * time.Second) // Delay for accurate delta
	currAll, currCores := readCPUStat()

	var perCore []float64
	for i, curr := range currCores {
		if i >= len(prevCores) {
			// The core was not present in the first sample; no delta available.
			perCore = append(perCore, 0)
			continue
		}
		perCore = append(perCore, curr.usageSince(prevCores[i]))
	}

	usage := CPUUsage{
		Total:   currAll.usageSince(prevAll),
		PerCore: perCore,
	}

	info, err := readCPUInfo()
	if err != nil {
		slog.Warn("Failed to read /proc/cpuinfo", "error", err)
		return usage
	}
	usage.Info = info
	return usage
}

// cpuSample holds the cumulative counters of one /proc/stat "cpu" line.
type cpuSample struct {
	total uint64 // sum of all time columns
	idle  uint64 // fourth time column (idle)
}

// usageSince returns the percentage of non-idle time between prev and s, or 0
// when no time has passed or the counters went backwards.
func (s cpuSample) usageSince(prev cpuSample) float64 {
	if s.total <= prev.total {
		return 0
	}
	totalDiff := float64(s.total - prev.total)
	var idleDiff float64
	if s.idle > prev.idle {
		idleDiff = float64(s.idle - prev.idle)
	}
	if idleDiff > totalDiff {
		idleDiff = totalDiff
	}
	return (totalDiff - idleDiff) / totalDiff * 100
}

// parseCPUStatLine sums the numeric columns of a /proc/stat "cpu" line.
// Columns that do not parse count as 0; a line without columns yields zeros.
func parseCPUStatLine(line string) cpuSample {
	var sample cpuSample
	fields := strings.Fields(line)
	if len(fields) < 2 {
		return sample
	}
	for i, field := range fields[1:] {
		val, _ := strconv.ParseUint(field, 10, 64)
		sample.total += val
		if i == 3 { // Idle time
			sample.idle = val
		}
	}
	return sample
}

// readCPUStat reads /proc/stat and returns the aggregate sample (first line)
// plus one sample for each directly following line that starts with "cpu".
func readCPUStat() (cpuSample, []cpuSample) {
	statBytes, err := os.ReadFile("/proc/stat")
	if err != nil {
		slog.Warn("Failed to read /proc/stat", "error", err)
		return cpuSample{}, nil
	}

	scanner := bufio.NewScanner(bytes.NewReader(statBytes))
	if !scanner.Scan() { // Total CPU line
		return cpuSample{}, nil
	}
	aggregate := parseCPUStatLine(scanner.Text())

	var cores []cpuSample
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "cpu") {
			break
		}
		cores = append(cores, parseCPUStatLine(line))
	}
	return aggregate, cores
}

// readCPUInfo extracts the first "vendor_id" and "model name" values from
// /proc/cpuinfo. Either may stay empty when the file does not contain it.
func readCPUInfo() (CPUInfo, error) {
	cpuInfoBytes, err := os.ReadFile("/proc/cpuinfo")
	if err != nil {
		return CPUInfo{}, err
	}

	var info CPUInfo
	scanner := bufio.NewScanner(bytes.NewReader(cpuInfoBytes))
	for scanner.Scan() {
		key, value, found := strings.Cut(scanner.Text(), ":")
		if !found {
			continue
		}
		switch strings.TrimSpace(key) {
		case "vendor_id":
			info.Vendor = strings.TrimSpace(value)
		case "model name":
			info.Model = strings.TrimSpace(value)
		}
		if info.Vendor != "" && info.Model != "" {
			break
		}
	}
	return info, nil
}
// GetMemoryUsage gathers memory usage from /proc/meminfo.
//
// Total, Free and Available come from the MemTotal, MemFree and MemAvailable
// lines (reported in kB, converted to bytes). Used is Total minus Available.
//
// If /proc/meminfo cannot be read, a warning is logged and a zero MemoryUsage
// is returned. When MemTotal is missing or zero, UsedPercent is 0 rather than
// NaN, and Used never wraps around when Available exceeds Total.
func GetMemoryUsage() MemoryUsage {
	data, err := os.ReadFile("/proc/meminfo")
	if err != nil {
		slog.Warn("Failed to read /proc/meminfo", "error", err)
		return MemoryUsage{}
	}

	var total, free, available uint64
	scanner := bufio.NewScanner(bytes.NewReader(data))
	for scanner.Scan() {
		line := scanner.Text()
		switch {
		case strings.HasPrefix(line, "MemTotal:"):
			total = parseMemInfoLine(line)
		case strings.HasPrefix(line, "MemFree:"):
			free = parseMemInfoLine(line)
		case strings.HasPrefix(line, "MemAvailable:"):
			available = parseMemInfoLine(line)
		}
	}

	var used uint64
	if total > available {
		used = total - available
	}
	var usedPercent float64
	if total > 0 {
		usedPercent = (float64(used) / float64(total)) * 100
	}

	return MemoryUsage{
		Total:       total * 1024, // Convert from KB to bytes
		Used:        used * 1024,
		Available:   available * 1024,
		Free:        free * 1024,
		UsedPercent: usedPercent,
	}
}
// parseMemInfoLine parses a line from /proc/meminfo of the form
// "Key:   12345 kB" and returns the numeric second field. The unit is not
// interpreted. A line without a value, or with a non-numeric value, yields 0.
func parseMemInfoLine(line string) uint64 {
	fields := strings.Fields(line)
	if len(fields) < 2 {
		return 0
	}
	val, err := strconv.ParseUint(fields[1], 10, 64)
	if err != nil {
		return 0
	}
	return val
}
// GetFilesystemUsage gathers usage statistics for the filesystem that holds
// the specified path by running `df -Pk -- <path>`.
//
// -P requests the portable one-line-per-filesystem layout and -k fixes the
// unit to 1024-byte blocks, so the parsed columns do not depend on the
// environment; "--" keeps a path that starts with "-" from being read as an
// option. Sizes are returned in bytes. UsedPercent is the percentage printed
// by df.
//
// An error is returned when df cannot be run or exits unsuccessfully (for
// example for a non-existent path), or when its output does not have the
// expected columns.
func GetFilesystemUsage(path string) (FilesystemUsage, error) {
	cmd := exec.Command("df", "-Pk", "--", path)
	output, err := cmd.Output()
	if err != nil {
		return FilesystemUsage{}, err
	}

	// Line 0 is the header; line 1 describes the filesystem.
	lines := strings.Split(string(output), "\n")
	if len(lines) < 2 {
		return FilesystemUsage{}, fmt.Errorf("unexpected `df` output format for path: %s", path)
	}
	fields := strings.Fields(lines[1])
	if len(fields) < 5 {
		return FilesystemUsage{}, fmt.Errorf("insufficient fields in `df` output for path: %s", path)
	}

	total, err := strconv.ParseUint(fields[1], 10, 64)
	if err != nil {
		return FilesystemUsage{}, fmt.Errorf("failed to parse total space: %w", err)
	}
	used, err := strconv.ParseUint(fields[2], 10, 64)
	if err != nil {
		return FilesystemUsage{}, fmt.Errorf("failed to parse used space: %w", err)
	}
	free, err := strconv.ParseUint(fields[3], 10, 64)
	if err != nil {
		return FilesystemUsage{}, fmt.Errorf("failed to parse free space: %w", err)
	}
	usedPercent, err := strconv.ParseFloat(strings.TrimSuffix(fields[4], "%"), 64)
	if err != nil {
		return FilesystemUsage{}, fmt.Errorf("failed to parse used percentage: %w", err)
	}

	return FilesystemUsage{
		Path:        path,
		Total:       total * 1024,
		Used:        used * 1024,
		Free:        free * 1024,
		UsedPercent: usedPercent,
	}, nil
}
// GetGPUUsage gathers GPU usage for all detected GPUs. Devices are enumerated
// with GetAllGPUInfo and sampled with the monitor that matches their PCI
// vendor ID. Only Intel and NVIDIA are monitored; AMD and any other vendor
// are left out of the result. When enumeration fails, a warning is logged and
// nil is returned.
func GetGPUUsage() []GPUUsage {
	devices, err := GetAllGPUInfo()
	if err != nil {
		slog.Warn("Failed to get GPU info", "error", err)
		return nil
	}

	var usages []GPUUsage
	for _, dev := range devices {
		switch dev.Vendor.ID {
		case VendorIntel:
			usages = append(usages, monitorIntelGPU(dev))
		case VendorNVIDIA:
			usages = append(usages, monitorNVIDIAGPU(dev))
		case VendorAMD:
			// TODO: Implement if needed
		}
	}
	return usages
}