---
# Alauda MonitorDashboard resource embedding a Grafana-format dashboard
# (spec.body) for NVIDIA DCGM exporter metrics and HAMi vGPU metrics.
#
# Conventions used in this file:
#   - Every panel reads from the datasource input DS_ALL ("${DS_ALL}").
#   - Queries are filtered by the two template variables declared under
#     spec.body.templating: $node_name and $uuid.
#   - Threshold steps are listed in ascending order with the base step
#     (value: null) first.
#   - Panels of type "timeseries" still carry legacy graph-panel keys
#     (aliasColors, bars, yaxes, ...); they are kept as-is.
apiVersion: ait.alauda.io/v1alpha2
kind: MonitorDashboard
metadata:
  annotations:
    cpaas.io/dashboard.version: "5"
    cpaas.io/display-name: '{"zh":"HAMi GPU 监控","en":"HAMi GPU Monitor"}'
  labels:
    cpaas.io/dashboard.folder: HAMi
  name: hami-vgpu-metrics-dashboard
  namespace: cpaas-system
spec:
  body:
    __elements: []
    __inputs:
      - description: ""
        label: ALL
        name: DS_ALL
        pluginId: prometheus
        pluginName: Prometheus
        type: datasource
    __requires:
      - id: bargauge
        name: Bar gauge
        type: panel
        version: ""
      - id: gauge
        name: Gauge
        type: panel
        version: ""
      - id: grafana
        name: Grafana
        type: grafana
        version: "8.5.5"
      - id: graph
        name: Graph (old)
        type: panel
        version: ""
      - id: prometheus
        name: Prometheus
        type: datasource
        version: "1.0.0"
      - id: stat
        name: Stat
        type: panel
        version: ""
      - id: table
        name: Table
        type: panel
        version: ""
    annotations:
      list:
        - $$hashKey: "object:192"
          builtIn: 1
          datasource:
            type: datasource
            uid: grafana
          enable: true
          hide: true
          iconColor: "rgba(0, 211, 255, 1)"
          name: "Annotations & Alerts"
          target:
            limit: 100
            matchAny: false
            tags: []
            type: dashboard
          type: dashboard
    description: >-
      GPU metrics dashboard based on NVIDIA DCGM-Exporter and
      HAMi/k8s-vgpu-scheduler, modified from
      https://grafana.com/grafana/dashboards/21833-hami-vgpu-dashboard/
    editable: true
    fiscalYearStartMonth: 0
    gnetId: 21833
    graphTooltip: 0
    id: null
    iteration: 1728370823317
    links: []
    liveNow: false
    panels:
      # Stat (id 47): number of DCGM_FI_DEV_DEC_UTIL series matching the
      # selected nodes and GPU UUIDs.
      - datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            color:
              mode: thresholds
            custom:
              spanNulls: false
            mappings: []
            thresholds:
              mode: absolute
              steps:
                - color: "#56A64B"
                  value: null
                - color: "#E02F44"
                  value: 80
          overrides: []
        gridPos:
          h: 5
          w: 3
          x: 0
          y: 0
        id: 47
        options:
          colorMode: value
          graphMode: area
          justifyMode: auto
          legend:
            calcs: []
          orientation: auto
          reduceOptions:
            calcs:
              - latest
            fields: ""
            values: false
          text: {}
          textMode: auto
          tooltip: {}
        pluginVersion: "8.5.5"
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            editorMode: code
            exemplar: false
            expr: 'count(DCGM_FI_DEV_DEC_UTIL{node_name=~"$node_name",UUID=~"$uuid"})'
            instant: true
            range: false
            refId: A
        title: GPU Total
        type: stat

      # Stat (id 59): sum of count_values over DCGM_FI_DEV_XID_ERRORS samples
      # greater than 0; falls back to 0 via "or on() vector(0)".
      # NOTE(review): the red step only starts at 80 — confirm this is the
      # intended alert level for an error counter.
      - datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            color:
              mode: thresholds
            custom:
              spanNulls: false
            mappings: []
            thresholds:
              mode: absolute
              steps:
                - color: "#56A64B"
                  value: null
                - color: "#E02F44"
                  value: 80
          overrides: []
        gridPos:
          h: 5
          w: 2
          x: 3
          y: 0
        id: 59
        options:
          colorMode: value
          graphMode: area
          justifyMode: auto
          legend:
            calcs: []
          orientation: auto
          reduceOptions:
            calcs:
              - latest
            fields: ""
            values: false
          text: {}
          textMode: auto
          tooltip: {}
        pluginVersion: "8.5.5"
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            editorMode: code
            exemplar: false
            # Folded scalar: the lines below are joined with single spaces.
            expr: >-
              sum(count_values("DCGM_FI_DEV_XID_ERRORS",
              DCGM_FI_DEV_XID_ERRORS{node_name=~"$node_name",UUID=~"$uuid"} > 0))
              or on() vector(0)
            instant: true
            range: false
            refId: A
        title: XID Error
        type: stat

      # Gauge (id 55): average of DCGM_FI_DEV_GPU_TEMP over the selection,
      # unit celsius, range 0-100, steps at 83 and 87.
      - datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            color:
              mode: thresholds
            custom:
              spanNulls: false
            mappings: []
            max: 100
            min: 0
            thresholds:
              mode: absolute
              steps:
                - color: "#56A64B"
                  value: null
                - color: "#EAB839"
                  value: 83
                - color: "#E02F44"
                  value: 87
            unit: celsius
          overrides: []
        gridPos:
          h: 5
          w: 3
          x: 5
          y: 0
        id: 55
        options:
          legend:
            calcs: []
          orientation: auto
          reduceOptions:
            calcs:
              - latest
            fields: ""
            values: false
          showThresholdLabels: false
          showThresholdMarkers: true
          text: {}
          tooltip: {}
        pluginVersion: "8.5.5"
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'avg(DCGM_FI_DEV_GPU_TEMP{node_name=~"$node_name",UUID=~"$uuid"})'
            interval: ""
            legendFormat: ""
            refId: A
        title: Average GPU Temp
        type: gauge

      # Gauge (id 57): sum of DCGM_FI_DEV_POWER_USAGE over the selection,
      # unit watt, range 0-2400, steps at 1800 and 2200.
      - datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            color:
              mode: thresholds
            custom:
              spanNulls: false
            mappings: []
            max: 2400
            min: 0
            thresholds:
              mode: absolute
              steps:
                - color: "#56A64B"
                  value: null
                - color: "#EAB839"
                  value: 1800
                - color: "#E02F44"
                  value: 2200
            unit: watt
          overrides: []
        gridPos:
          h: 5
          w: 3
          x: 8
          y: 0
        id: 57
        links: []
        options:
          legend:
            calcs: []
          orientation: horizontal
          reduceOptions:
            calcs:
              - sum
            fields: ""
            values: false
          showThresholdLabels: false
          showThresholdMarkers: true
          text: {}
          tooltip: {}
        pluginVersion: "8.5.5"
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            editorMode: code
            exemplar: false
            expr: 'sum(DCGM_FI_DEV_POWER_USAGE{node_name=~"$node_name",UUID=~"$uuid"})'
            instant: true
            interval: ""
            legendFormat: ""
            range: false
            refId: A
        title: GPU power usage
        type: gauge

      # Time series (id 12): DCGM_FI_DEV_GPU_TEMP per series, unit celsius,
      # legend rendered as a table on the right.
      - aliasColors: {}
        bars: false
        dashLength: 10
        dashes: false
        datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            custom:
              drawStyle: area
              fillOpacity: 20
              spanNulls: false
            links: []
            thresholds:
              steps: []
            unit: celsius
          overrides: []
        fill: 1
        fillGradient: 0
        gridPos:
          h: 5
          w: 13
          x: 11
          y: 0
        hiddenSeries: false
        id: 12
        legend:
          alignAsTable: false
          avg: false
          current: true
          hideEmpty: false
          hideZero: false
          max: false
          min: false
          rightSide: true
          show: true
          sideWidth: 150
          sort: current
          sortDesc: false
          total: false
          values: true
        lines: true
        linewidth: 2
        nullPointMode: "null"
        options:
          alertThreshold: true
          legend:
            calcs:
              - latest
            displayMode: table
            placement: right
            showLegend: true
          reduceOptions: {}
          tooltip:
            sort: none
        percentage: false
        pluginVersion: "8.5.5"
        pointradius: 2
        points: false
        renderer: flot
        seriesOverrides: []
        spaceLength: 10
        stack: false
        steppedLine: false
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'DCGM_FI_DEV_GPU_TEMP{node_name=~"$node_name",UUID=~"$uuid"}'
            instant: false
            interval: ""
            legendFormat: "{{.node_name}} uid:{{.UUID}}"
            refId: A
        thresholds: []
        timeRegions: []
        title: GPU temp(DCGM)
        tooltip:
          shared: true
          sort: 0
          value_type: individual
        type: timeseries
        xaxis:
          mode: time
          show: true
          values: []
        yaxes:
          - $$hashKey: "object:97"
            format: celsius
            logBase: 1
            show: true
          - $$hashKey: "object:98"
            format: short
            logBase: 1
            show: true
        yaxis:
          align: false

      # Table (id 53): instant nodeGPUOverview samples; labels are turned into
      # columns, then filtered/ordered/renamed by the organize transformation.
      - datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            custom:
              align: auto
              displayMode: auto
              filterable: false
              inspect: false
            mappings: []
            thresholds:
              mode: absolute
              steps:
                - color: "#56A64B"
                  value: null
                - color: "#E02F44"
                  value: 80
          overrides: []
        gridPos:
          h: 5
          w: 11
          x: 0
          y: 5
        id: 53
        options:
          footer:
            fields: ""
            reducer:
              - sum
            show: false
          legend:
            calcs: []
          reduceOptions: {}
          showHeader: true
          # NOTE(review): sorts by "instance", which excludeByName below
          # hides — confirm the sort column.
          sortBy:
            - desc: true
              displayName: instance
          tooltip: {}
        pluginVersion: "8.5.5"
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            editorMode: code
            exemplar: false
            expr: 'nodeGPUOverview{nodeid=~"$node_name",deviceuuid=~"$uuid"}'
            instant: true
            range: false
            refId: A
        title: nodeGPUOverview
        transformations:
          - id: labelsToFields
            options: {}
          - id: merge
            options: {}
          - id: merge
            options: {}
          - id: organize
            options:
              excludeByName:
                Time: true
                Value: true
                branch: true
                container: true
                dc: true
                devicecores: true
                deviceidx: true
                devicememorylimit: false
                goversion: true
                instance: true
                ip: true
                job: true
                namespace: true
                node_name: true
                nodeid: false
                pod: true
                project: true
                revision: true
                sharedcontainers: false
                zone: true
              indexByName:
                Time: 2
                Value: 3
                dc: 4
                devicecores: 5
                deviceidx: 6
                devicememorylimit: 11
                devicetype: 10
                deviceuuid: 12
                instance: 1
                ip: 9
                job: 0
                node_name: 8
                nodeid: 7
                project: 13
                sharedcontainers: 14
                zone: 15
              renameByName:
                devicememorylimit: vram
                devicetype: ""
                node_name: ""
                sharedcontainers: ""
        type: table

      # Table (id 61): instant DCGM_FI_DEV_POWER_USAGE samples used as a GPU
      # list; the Value column itself is excluded, only labels are shown.
      - datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            custom:
              align: auto
              displayMode: auto
              filterable: false
              inspect: false
            mappings: []
            thresholds:
              mode: absolute
              steps:
                - color: "#56A64B"
                  value: null
                - color: "#E02F44"
                  value: 80
          overrides: []
        gridPos:
          h: 5
          w: 13
          x: 11
          y: 5
        id: 61
        options:
          footer:
            fields: ""
            reducer:
              - sum
            show: false
          legend:
            calcs: []
          reduceOptions: {}
          showHeader: true
          # NOTE(review): sorts by "instance", which excludeByName below
          # hides — confirm the sort column.
          sortBy:
            - desc: true
              displayName: instance
          tooltip: {}
        pluginVersion: "8.5.5"
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            editorMode: code
            exemplar: false
            expr: 'DCGM_FI_DEV_POWER_USAGE{node_name=~"$node_name", UUID=~"$uuid"}'
            instant: true
            range: false
            refId: A
        title: nodeGPUList(DCGM)
        transformations:
          - id: labelsToFields
            options: {}
          - id: merge
            options: {}
          - id: merge
            options: {}
          - id: organize
            options:
              excludeByName:
                DCGM_FI_CUDA_DRIVER_VERSION: false
                DCGM_FI_DEV_BRAND: true
                DCGM_FI_DEV_MINOR_NUMBER: true
                DCGM_FI_DEV_NAME: true
                DCGM_FI_DEV_SERIAL: true
                DCGM_FI_DRIVER_VERSION: false
                DCGM_FI_PROCESS_NAME: true
                Hostname: true
                Time: true
                UUID: false
                Value: true
                branch: true
                container: true
                dc: true
                device: true
                devicecores: true
                deviceidx: true
                goversion: true
                gpu: true
                instance: true
                ip: true
                job: true
                modelName: false
                namespace: true
                node_name: false
                nodeid: false
                pod: true
                project: true
                revision: true
                sharedcontainers: false
                zone: true
              indexByName:
                DCGM_FI_CUDA_DRIVER_VERSION: 11
                DCGM_FI_DEV_BRAND: 13
                DCGM_FI_DEV_MINOR_NUMBER: 14
                DCGM_FI_DEV_NAME: 15
                DCGM_FI_DEV_SERIAL: 16
                DCGM_FI_DRIVER_VERSION: 12
                DCGM_FI_PROCESS_NAME: 17
                Hostname: 6
                Time: 1
                UUID: 19
                Value: 2
                dc: 3
                device: 10
                gpu: 7
                instance: 5
                ip: 18
                job: 0
                modelName: 9
                node_name: 8
                project: 4
              renameByName:
                devicememorylimit: vram
                devicetype: ""
                node_name: ""
                sharedcontainers: ""
        type: table

      # Bar gauge (id 49): vGPUCorePercentage summed by (deviceuuid, nodename),
      # range 0-1.
      # NOTE(review): filtered by $node_name only, not by $uuid — confirm.
      - datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            color:
              mode: thresholds
            custom:
              spanNulls: false
            mappings: []
            max: 1
            min: 0
            thresholds:
              mode: absolute
              steps:
                - color: "#56A64B"
                  value: null
          overrides: []
        gridPos:
          h: 5
          w: 11
          x: 0
          y: 10
        id: 49
        interval: ""
        links: []
        options:
          displayMode: basic
          legend:
            calcs: []
          minVizHeight: 10
          minVizWidth: 0
          orientation: horizontal
          reduceOptions:
            calcs:
              - latest
            fields: ""
            values: false
          showUnfilled: true
          text: {}
          tooltip: {}
        pluginVersion: "8.5.5"
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'sum by (deviceuuid, nodename) (vGPUCorePercentage{nodename=~"$node_name"})'
            format: time_series
            instant: false
            interval: ""
            intervalFactor: 1
            legendFormat: " {{.nodename}}:{{.deviceuuid}}"
            refId: A
        title: vGPUCorePercentage
        type: bargauge

      # Time series (id 18): DCGM_FI_DEV_FB_USED per series, unit decmbytes.
      - aliasColors: {}
        bars: false
        dashLength: 10
        dashes: false
        datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            custom:
              drawStyle: area
              fillOpacity: 20
              spanNulls: false
            links: []
            thresholds:
              steps: []
            unit: decmbytes
          overrides: []
        fill: 1
        fillGradient: 0
        gridPos:
          h: 8
          w: 7
          x: 11
          y: 10
        hiddenSeries: false
        id: 18
        legend:
          avg: false
          current: true
          max: false
          min: false
          rightSide: false
          show: true
          total: false
          values: true
        lines: true
        linewidth: 2
        nullPointMode: "null"
        options:
          alertThreshold: true
          legend:
            calcs: []
            displayMode: list
            placement: bottom
            showLegend: true
          reduceOptions: {}
          tooltip:
            sort: none
        percentage: false
        pluginVersion: "8.5.5"
        pointradius: 2
        points: false
        renderer: flot
        seriesOverrides: []
        spaceLength: 10
        stack: false
        steppedLine: false
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'DCGM_FI_DEV_FB_USED{node_name=~"$node_name", UUID=~"$uuid"}'
            interval: ""
            legendFormat: "{{.node_name}} uid:{{.UUID}}"
            refId: A
        thresholds: []
        timeRegions: []
        title: GPU FB used(DCGM)
        tooltip:
          shared: true
          sort: 0
          value_type: individual
        type: timeseries
        xaxis:
          mode: time
          show: true
          values: []
        yaxes:
          - $$hashKey: "object:618"
            format: decmbytes
            logBase: 1
            show: true
          - $$hashKey: "object:619"
            format: short
            logBase: 1
            show: true
        yaxis:
          align: false

      # Time series (id 6): DCGM_FI_DEV_GPU_UTIL per series, unit percent,
      # max "100".
      - aliasColors: {}
        bars: false
        dashLength: 10
        dashes: false
        datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            custom:
              drawStyle: area
              fillOpacity: 20
              spanNulls: false
            links: []
            max: "100"
            thresholds:
              steps: []
            unit: percent
          overrides: []
        fill: 1
        fillGradient: 0
        gridPos:
          h: 8
          w: 6
          x: 18
          y: 10
        hiddenSeries: false
        id: 6
        legend:
          alignAsTable: false
          avg: false
          current: true
          max: false
          min: false
          rightSide: false
          show: true
          total: false
          values: true
        lines: true
        linewidth: 2
        nullPointMode: "null"
        options:
          alertThreshold: true
          legend:
            calcs: []
            displayMode: list
            placement: bottom
            showLegend: true
          reduceOptions: {}
          tooltip:
            sort: none
        percentage: false
        pluginVersion: "8.5.5"
        pointradius: 2
        points: false
        renderer: flot
        seriesOverrides: []
        spaceLength: 10
        stack: false
        steppedLine: false
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'DCGM_FI_DEV_GPU_UTIL{node_name=~"$node_name", UUID=~"$uuid"}'
            interval: ""
            legendFormat: "{{.node_name}} uid:{{.UUID}}"
            refId: A
        thresholds: []
        timeRegions: []
        title: GPU util(DCGM)
        tooltip:
          shared: true
          sort: 0
          value_type: cumulative
        type: timeseries
        xaxis:
          mode: time
          show: true
          values: []
        yaxes:
          - $$hashKey: "object:699"
            format: percent
            logBase: 1
            max: "100"
            min: "0"
            show: true
          - $$hashKey: "object:700"
            format: short
            logBase: 1
            show: true
        yaxis:
          align: false

      # Bar gauge (id 51): vGPUMemoryPercentage summed by
      # (deviceuuid, nodename), range 0-1, palette-classic colors.
      # NOTE(review): filtered by $node_name only, not by $uuid — confirm.
      - datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            color:
              mode: palette-classic
            custom:
              spanNulls: false
            mappings: []
            max: 1
            min: 0
            thresholds:
              mode: absolute
              steps:
                - color: "#56A64B"
                  value: null
          overrides: []
        gridPos:
          h: 5
          w: 11
          x: 0
          y: 15
        id: 51
        interval: ""
        links: []
        options:
          displayMode: basic
          legend:
            calcs: []
          minVizHeight: 10
          minVizWidth: 0
          orientation: horizontal
          reduceOptions:
            calcs:
              - latest
            fields: ""
            values: false
          showUnfilled: true
          text: {}
          tooltip: {}
        pluginVersion: "8.5.5"
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'sum by (deviceuuid, nodename) (vGPUMemoryPercentage{nodename=~"$node_name"})'
            format: time_series
            interval: ""
            intervalFactor: 1
            legendFormat: "{{.nodename}}:{{.deviceuuid}}"
            refId: A
        title: vGPUMemoryPercentage
        type: bargauge

      # Time series (id 10): DCGM_FI_DEV_POWER_USAGE per series, unit watt.
      - aliasColors: {}
        bars: false
        dashLength: 10
        dashes: false
        datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            custom:
              drawStyle: area
              fillOpacity: 20
              spanNulls: false
            links: []
            thresholds:
              steps: []
            unit: watt
          overrides: []
        fill: 1
        fillGradient: 0
        gridPos:
          h: 7
          w: 7
          x: 11
          y: 18
        hiddenSeries: false
        id: 10
        legend:
          alignAsTable: false
          avg: false
          current: true
          max: false
          min: false
          rightSide: false
          show: true
          total: false
          values: true
        lines: true
        linewidth: 2
        nullPointMode: "null"
        options:
          alertThreshold: true
          legend:
            calcs: []
            displayMode: list
            placement: bottom
            showLegend: true
          reduceOptions: {}
          tooltip:
            sort: none
        percentage: false
        pluginVersion: "8.5.5"
        pointradius: 2
        points: false
        renderer: flot
        seriesOverrides: []
        spaceLength: 10
        stack: false
        steppedLine: false
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'DCGM_FI_DEV_POWER_USAGE{node_name=~"$node_name",UUID=~"$uuid"}'
            interval: ""
            legendFormat: "{{.node_name}} uid:{{.UUID}}"
            refId: A
        thresholds: []
        timeRegions: []
        title: GPU power usage(DCGM)
        tooltip:
          shared: true
          sort: 0
          value_type: individual
        type: timeseries
        xaxis:
          mode: time
          show: true
          values: []
        yaxes:
          - $$hashKey: "object:214"
            format: watt
            logBase: 1
            show: true
          - $$hashKey: "object:215"
            format: short
            logBase: 1
            show: true
        yaxis:
          align: false

      # Time series (id 65): DCGM_FI_DEV_SM_CLOCK multiplied by 1000000 and
      # displayed with unit hertz.
      - aliasColors: {}
        bars: false
        dashLength: 10
        dashes: false
        datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            custom:
              drawStyle: area
              fillOpacity: 20
              spanNulls: false
            thresholds:
              steps: []
            unit: hertz
          overrides: []
        fill: 1
        fillGradient: 0
        gridPos:
          h: 7
          w: 6
          x: 18
          y: 18
        hiddenSeries: false
        id: 65
        interval: ""
        legend:
          alignAsTable: false
          avg: false
          current: true
          max: false
          min: false
          rightSide: false
          show: true
          sideWidth: 80
          total: false
          values: true
        lines: true
        linewidth: 2
        nullPointMode: "null"
        options:
          alertThreshold: true
          legend:
            calcs: []
            displayMode: list
            placement: bottom
            showLegend: true
          reduceOptions: {}
          tooltip:
            sort: none
        percentage: false
        pluginVersion: "8.5.5"
        pointradius: 2
        points: false
        renderer: flot
        seriesOverrides: []
        spaceLength: 10
        stack: false
        steppedLine: false
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'DCGM_FI_DEV_SM_CLOCK{node_name=~"$node_name", UUID=~"$uuid"} * 1000000'
            format: time_series
            interval: ""
            intervalFactor: 1
            legendFormat: "{{.node_name}} uid:{{.UUID}}"
            refId: A
        thresholds: []
        timeRegions: []
        title: GPU SM Clock(DCGM)
        tooltip:
          shared: true
          sort: 0
          value_type: individual
        type: timeseries
        xaxis:
          mode: time
          show: true
          values: []
        yaxes:
          - $$hashKey: "object:536"
            format: hertz
            label: ""
            logBase: 1
            show: true
          - $$hashKey: "object:537"
            format: short
            logBase: 1
            show: true
        yaxis:
          align: false

      # Bar gauge (id 36): number of vGPU_device_memory_usage_in_bytes series
      # per node_name.
      # NOTE(review): max is 10 while the red step starts at 80 — confirm
      # which of the two values is intended.
      - datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            color:
              mode: thresholds
            custom:
              spanNulls: false
            mappings: []
            max: 10
            min: 0
            thresholds:
              mode: absolute
              steps:
                - color: "#56A64B"
                  index: 0
                  value: null
                - color: "#E02F44"
                  index: 1
                  value: 80
          overrides: []
        gridPos:
          h: 5
          w: 11
          x: 0
          y: 20
        id: 36
        interval: ""
        links: []
        options:
          displayMode: basic
          legend:
            calcs: []
          minVizHeight: 10
          minVizWidth: 0
          orientation: horizontal
          reduceOptions:
            calcs:
              - latest
            fields: ""
            values: false
          showUnfilled: true
          text: {}
          tooltip: {}
        pluginVersion: "8.5.5"
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'count by (node_name) (vGPU_device_memory_usage_in_bytes{node_name=~"$node_name"})'
            format: time_series
            interval: ""
            intervalFactor: 1
            legendFormat: "{{.node_name}}"
            refId: A
        title: vgpu used by nodes
        type: bargauge

      # Time series (id 24): Device_memory_desc_of_container summed by podname,
      # unit bytes.
      - aliasColors: {}
        bars: false
        dashLength: 10
        dashes: false
        datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            custom:
              drawStyle: area
              fillOpacity: 20
              spanNulls: false
            links: []
            thresholds:
              steps: []
            unit: bytes
          overrides: []
        fill: 1
        fillGradient: 0
        gridPos:
          h: 10
          w: 11
          x: 0
          y: 25
        hiddenSeries: false
        id: 24
        legend:
          alignAsTable: false
          avg: false
          current: true
          max: false
          min: false
          rightSide: true
          show: true
          sideWidth: 150
          total: false
          values: true
        lines: true
        linewidth: 2
        nullPointMode: "null"
        options:
          alertThreshold: true
          legend:
            calcs:
              - latest
            displayMode: table
            placement: right
            showLegend: true
          reduceOptions: {}
          tooltip:
            sort: none
        percentage: false
        pluginVersion: "8.5.5"
        pointradius: 2
        points: false
        renderer: flot
        seriesOverrides: []
        spaceLength: 10
        stack: false
        steppedLine: false
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            # Folded scalar: the lines below are joined with single spaces.
            expr: >-
              sum by (podname)
              (Device_memory_desc_of_container{node_name=~"$node_name",deviceuuid=~"$uuid"})
            interval: ""
            legendFormat: "{{.podname}}"
            refId: A
        thresholds: []
        timeRegions: []
        title: "HAMi-Memory desc of container "
        tooltip:
          shared: true
          sort: 0
          value_type: individual
        type: timeseries
        xaxis:
          mode: time
          show: true
          values: []
        yaxes:
          - $$hashKey: "object:779"
            format: bytes
            logBase: 1
            show: true
          - $$hashKey: "object:780"
            format: short
            logBase: 1
            show: true
        yaxis:
          align: false

      # Time series (id 38): Device_utilization_desc_of_container summed by
      # podname, unit percent, max "100".
      - aliasColors: {}
        bars: false
        dashLength: 10
        dashes: false
        datasource:
          type: prometheus
          uid: "${DS_ALL}"
        # NOTE(review): 150 decimals looks unintended (same number as the
        # legend sideWidth below) — confirm before changing.
        decimals: 150
        fieldConfig:
          defaults:
            custom:
              drawStyle: area
              fillOpacity: 20
              spanNulls: false
            links: []
            max: "100"
            thresholds:
              steps: []
            unit: percent
          overrides: []
        fill: 1
        fillGradient: 0
        gridPos:
          h: 10
          w: 13
          x: 11
          y: 25
        hiddenSeries: false
        id: 38
        legend:
          alignAsTable: false
          avg: false
          current: true
          max: false
          min: false
          rightSide: true
          show: true
          sideWidth: 150
          total: false
          values: true
        lines: true
        linewidth: 2
        nullPointMode: "null"
        options:
          alertThreshold: true
          legend:
            calcs:
              - latest
            displayMode: table
            placement: right
            showLegend: true
          reduceOptions: {}
          tooltip:
            sort: none
        percentage: false
        pluginVersion: "8.5.5"
        pointradius: 2
        points: false
        renderer: flot
        seriesOverrides: []
        spaceLength: 10
        stack: false
        steppedLine: false
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            # Folded scalar: the lines below are joined with single spaces.
            expr: >-
              sum by (podname)
              (Device_utilization_desc_of_container{node_name=~"$node_name",deviceuuid=~"$uuid"})
            interval: ""
            legendFormat: "{{.podname}}"
            refId: A
        thresholds: []
        timeRegions: []
        title: HAMi-GPU util desc of container
        tooltip:
          shared: true
          sort: 0
          value_type: individual
        type: timeseries
        xaxis:
          mode: time
          show: true
          values: []
        yaxes:
          - $$hashKey: "object:1243"
            format: percent
            logBase: 1
            max: "100"
            min: "0"
            show: true
          - $$hashKey: "object:1244"
            format: short
            logBase: 1
            show: true
        yaxis:
          align: false

      # Time series (id 22): HostGPUMemoryUsage per series, unit bytes.
      - aliasColors: {}
        bars: false
        dashLength: 10
        dashes: false
        datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            custom:
              drawStyle: area
              fillOpacity: 20
              spanNulls: false
            links: []
            thresholds:
              steps: []
            unit: bytes
          overrides: []
        fill: 1
        fillGradient: 0
        gridPos:
          h: 10
          w: 11
          x: 0
          y: 35
        hiddenSeries: false
        id: 22
        legend:
          alignAsTable: false
          avg: false
          current: true
          max: false
          min: false
          rightSide: true
          show: true
          sideWidth: 150
          total: false
          values: true
        lines: true
        linewidth: 2
        nullPointMode: "null"
        options:
          alertThreshold: true
          legend:
            calcs:
              - latest
            displayMode: table
            placement: right
            showLegend: true
          reduceOptions: {}
          tooltip:
            sort: none
        percentage: false
        pluginVersion: "8.5.5"
        pointradius: 2
        points: false
        renderer: flot
        seriesOverrides: []
        spaceLength: 10
        stack: false
        steppedLine: false
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'HostGPUMemoryUsage{node_name=~"$node_name",deviceuuid=~"$uuid"}'
            interval: ""
            legendFormat: "{{.node_name}} uid:{{.deviceuuid}}"
            refId: A
        thresholds: []
        timeRegions: []
        title: HAMi-Host memory usage
        tooltip:
          shared: true
          sort: 0
          value_type: individual
        type: timeseries
        xaxis:
          mode: time
          show: true
          values: []
        yaxes:
          - $$hashKey: "object:1087"
            format: bytes
            logBase: 1
            show: true
          - $$hashKey: "object:1088"
            format: short
            logBase: 1
            show: true
        yaxis:
          align: false

      # Time series (id 20): HostCoreUtilization per series, unit percent,
      # max "100".
      - aliasColors: {}
        bars: false
        dashLength: 10
        dashes: false
        datasource:
          type: prometheus
          uid: "${DS_ALL}"
        fieldConfig:
          defaults:
            custom:
              drawStyle: area
              fillOpacity: 20
              spanNulls: false
            links: []
            max: "100"
            thresholds:
              steps: []
            unit: percent
          overrides: []
        fill: 1
        fillGradient: 0
        gridPos:
          h: 10
          w: 13
          x: 11
          y: 35
        hiddenSeries: false
        id: 20
        legend:
          alignAsTable: false
          avg: false
          current: true
          max: false
          min: false
          rightSide: true
          show: true
          sideWidth: 150
          total: false
          values: true
        lines: true
        linewidth: 2
        nullPointMode: "null"
        options:
          alertThreshold: true
          legend:
            calcs:
              - latest
            displayMode: table
            placement: right
            showLegend: true
          reduceOptions: {}
          tooltip:
            sort: none
        percentage: false
        pluginVersion: "8.5.5"
        pointradius: 2
        points: false
        renderer: flot
        seriesOverrides: []
        spaceLength: 10
        stack: false
        steppedLine: false
        targets:
          - datasource:
              type: prometheus
              uid: "${DS_ALL}"
            exemplar: true
            expr: 'HostCoreUtilization{node_name=~"$node_name",deviceuuid=~"$uuid"}'
            interval: ""
            legendFormat: "{{.node_name}} uid:{{.deviceuuid}}"
            refId: A
        thresholds: []
        timeRegions: []
        title: HAMi-Host core util
        tooltip:
          shared: true
          sort: 0
          value_type: individual
        type: timeseries
        xaxis:
          mode: time
          show: true
          values: []
        yaxes:
          - $$hashKey: "object:1243"
            format: percent
            logBase: 1
            max: "100"
            min: "0"
            show: true
          - $$hashKey: "object:1244"
            format: short
            logBase: 1
            show: true
        yaxis:
          align: false
    refresh: 5s
    schemaVersion: 36
    style: dark
    tags: []
    templating:
      list:
        # $node_name: node_name label values taken from either
        # DCGM_FI_DEV_FB_FREE or vGPU_device_memory_limit_in_bytes.
        - current: {}
          datasource:
            type: prometheus
            uid: "${DS_ALL}"
          definition: 'label_values({__name__=~"DCGM_FI_DEV_FB_FREE|vGPU_device_memory_limit_in_bytes"}, node_name)'
          hide: 0
          includeAll: true
          multi: true
          name: node_name
          options: []
          query:
            query: 'label_values({__name__=~"DCGM_FI_DEV_FB_FREE|vGPU_device_memory_limit_in_bytes"}, node_name)'
            refId: StandardVariableQuery
          refresh: 1
          regex: ""
          skipUrlSync: false
          sort: 1
          tagValuesQuery: ""
          tagsQuery: ""
          type: query
          useTags: false
        # $uuid: UUID label values of DCGM_FI_DEV_FB_FREE on the selected
        # nodes (depends on $node_name).
        - current: {}
          datasource:
            type: prometheus
            uid: "${DS_ALL}"
          definition: 'label_values(DCGM_FI_DEV_FB_FREE{node_name=~"$node_name"},UUID)'
          hide: 0
          includeAll: true
          multi: true
          name: uuid
          options: []
          query:
            query: 'label_values(DCGM_FI_DEV_FB_FREE{node_name=~"$node_name"},UUID)'
            refId: StandardVariableQuery
          refresh: 1
          regex: ""
          skipUrlSync: false
          sort: 1
          tagValuesQuery: ""
          tagsQuery: ""
          type: query
          useTags: false
    time:
      from: now-12h
      to: now
    timepicker:
      refresh_intervals:
        - 5s
        - 10s
        - 30s
        - 1m
        - 5m
        - 15m
        - 30m
        - 1h
        - 2h
        - 1d
    timezone: browser
    title: hami-vgpu-metrics-dashboard
    uid: Oxed_c6Wz22
    version: 2
    weekStart: ""