Skip to content

Commit a3fb257

Browse files
authored
Merge pull request #61 from SataQiu/fix-dynamic-mig
Fix dynamic mig feature
2 parents 730f728 + 5ae039f commit a3fb257

File tree

8 files changed

+101
-75
lines changed

8 files changed

+101
-75
lines changed

cmd/vgpu/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ func start() error {
104104
klog.Info("Starting OS watcher.")
105105
sigs := NewOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
106106

107+
nvidiaCfg := util.LoadNvidiaConfig()
108+
107109
cache := nvidiadevice.NewDeviceCache()
108110
cache.Start()
109111
defer cache.Stop()
@@ -124,7 +126,7 @@ restart:
124126
if err != nil {
125127
return fmt.Errorf("error creating MIG strategy: %v", err)
126128
}
127-
plugins = migStrategy.GetPlugins(cache)
129+
plugins = migStrategy.GetPlugins(nvidiaCfg, cache)
128130

129131
started := 0
130132
pluginStartError := make(chan struct{})

pkg/plugin/vgpu/cache.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@ import (
2020
"sync"
2121

2222
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
23+
"volcano.sh/k8s-device-plugin/pkg/plugin/vgpu/config"
2324
)
2425

2526
type DeviceCache struct {
26-
GpuDeviceManager
27+
*GpuDeviceManager
2728

2829
cache []*Device
2930
stopCh chan interface{}
@@ -33,8 +34,12 @@ type DeviceCache struct {
3334
}
3435

3536
func NewDeviceCache() *DeviceCache {
37+
skipMigEnabledGPUs := true
38+
if config.Mode == "mig" {
39+
skipMigEnabledGPUs = false
40+
}
3641
return &DeviceCache{
37-
GpuDeviceManager: GpuDeviceManager{true},
42+
GpuDeviceManager: NewGpuDeviceManager(skipMigEnabledGPUs),
3843
stopCh: make(chan interface{}),
3944
unhealthy: make(chan *Device),
4045
notifyCh: make(map[string]chan *Device),

pkg/plugin/vgpu/config/config.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ var (
4848
globalDevice device.Interface
4949
)
5050

51+
var (
52+
// DevicePluginFilterDevice identifies the devices that the device plugin must filter out and not register.
53+
DevicePluginFilterDevice *FilterDevice
54+
)
55+
5156
func Nvml() nvml.Interface {
5257
return nvmllib
5358
}

pkg/plugin/vgpu/mig-strategy.go

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ type MigStrategyResourceSet map[string]struct{}
3939

4040
// MigStrategy provides an interface for building the set of plugins required to implement a given MIG strategy
4141
type MigStrategy interface {
42-
GetPlugins(cache *DeviceCache) []*NvidiaDevicePlugin
42+
GetPlugins(cfg *config.NvidiaConfig, cache *DeviceCache) []*NvidiaDevicePlugin
4343
MatchesResource(mig *nvml.Device, resource string) bool
4444
}
4545

@@ -61,24 +61,27 @@ type migStrategySingle struct{}
6161
type migStrategyMixed struct{}
6262

6363
// migStrategyNone
64-
func (s *migStrategyNone) GetPlugins(cache *DeviceCache) []*NvidiaDevicePlugin {
64+
func (s *migStrategyNone) GetPlugins(cfg *config.NvidiaConfig, cache *DeviceCache) []*NvidiaDevicePlugin {
6565
return []*NvidiaDevicePlugin{
6666
NewNvidiaDevicePlugin(
6767
//"nvidia.com/gpu",
6868
util.ResourceName,
6969
cache,
7070
gpuallocator.NewBestEffortPolicy(),
71-
pluginapi.DevicePluginPath+"nvidia-gpu.sock"),
71+
pluginapi.DevicePluginPath+"nvidia-gpu.sock",
72+
cfg),
7273
NewNvidiaDevicePlugin(
7374
util.ResourceMem,
7475
cache,
7576
gpuallocator.NewBestEffortPolicy(),
76-
pluginapi.DevicePluginPath+"nvidia-gpu-memory.sock"),
77+
pluginapi.DevicePluginPath+"nvidia-gpu-memory.sock",
78+
cfg),
7779
NewNvidiaDevicePlugin(
7880
util.ResourceCores,
7981
cache,
8082
gpuallocator.NewBestEffortPolicy(),
81-
pluginapi.DevicePluginPath+"nvidia-gpu-cores.sock"),
83+
pluginapi.DevicePluginPath+"nvidia-gpu-cores.sock",
84+
cfg),
8285
}
8386
}
8487

@@ -87,7 +90,7 @@ func (s *migStrategyNone) MatchesResource(mig *nvml.Device, resource string) boo
8790
}
8891

8992
// migStrategySingle
90-
func (s *migStrategySingle) GetPlugins(cache *DeviceCache) []*NvidiaDevicePlugin {
93+
func (s *migStrategySingle) GetPlugins(cfg *config.NvidiaConfig, cache *DeviceCache) []*NvidiaDevicePlugin {
9194
panic("single mode in MIG currently not supported")
9295
}
9396

@@ -96,7 +99,7 @@ func (s *migStrategySingle) MatchesResource(mig *nvml.Device, resource string) b
9699
}
97100

98101
// migStrategyMixed
99-
func (s *migStrategyMixed) GetPlugins(cache *DeviceCache) []*NvidiaDevicePlugin {
102+
func (s *migStrategyMixed) GetPlugins(cfg *config.NvidiaConfig, cache *DeviceCache) []*NvidiaDevicePlugin {
100103
devices := NewMIGCapableDevices()
101104

102105
if err := devices.AssertAllMigEnabledDevicesAreValid(); err != nil {
@@ -128,7 +131,8 @@ func (s *migStrategyMixed) GetPlugins(cache *DeviceCache) []*NvidiaDevicePlugin
128131
util.ResourceName,
129132
cache,
130133
gpuallocator.NewBestEffortPolicy(),
131-
pluginapi.DevicePluginPath+"nvidia-gpu.sock"),
134+
pluginapi.DevicePluginPath+"nvidia-gpu.sock",
135+
cfg),
132136
}
133137

134138
for resource := range resources {

pkg/plugin/vgpu/nvidia.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,19 @@ func (g *GpuDeviceManager) Devices() []*Device {
109109
continue
110110
}
111111

112+
// Auto-enable MIG mode when the plugin is running in MIG mode
113+
if config.Mode == "mig" && migMode != nvml.DEVICE_MIG_ENABLE {
114+
if ret == nvml.ERROR_NOT_SUPPORTED {
115+
klog.V(4).Infof("Node is configed as MIG mode, but GPU %v does not support MIG mode", i)
116+
continue
117+
}
118+
ret, stat := d.SetMigMode(nvml.DEVICE_MIG_ENABLE)
119+
if ret != nvml.SUCCESS || stat != nvml.SUCCESS {
120+
klog.V(4).Infof("Node is configed as MIG mode, but failed to enable MIG mode for GPU %v : ret=%v, stat=%v", i, ret, stat)
121+
continue
122+
}
123+
}
124+
112125
dev, err := buildDevice(fmt.Sprintf("%v", i), d)
113126
if err != nil {
114127
log.Panicln("Fatal:", err)

pkg/plugin/vgpu/plugin.go

Lines changed: 4 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ package vgpu
1818

1919
import (
2020
"bytes"
21-
"encoding/json"
2221
"errors"
2322
"fmt"
2423
"log"
@@ -68,7 +67,7 @@ type NvidiaDevicePlugin struct {
6867
deviceListEnvvar string
6968
allocatePolicy gpuallocator.Policy
7069
socket string
71-
schedulerConfig config.NvidiaConfig
70+
schedulerConfig *config.NvidiaConfig
7271
operatingMode string
7372

7473
virtualDevices []*pluginapi.Device
@@ -82,74 +81,16 @@ type NvidiaDevicePlugin struct {
8281
migStrategy string
8382
}
8483

85-
var (
86-
// DevicePluginFilterDevice identifies the devices that the device plugin must filter out and not register.
87-
DevicePluginFilterDevice *config.FilterDevice
88-
)
89-
90-
func readFromConfigFile(sConfig *config.NvidiaConfig) (string, error) {
91-
config.Mode = "hami-core"
92-
jsonbyte, err := os.ReadFile("/config/config.json")
93-
if err != nil {
94-
return "", err
95-
}
96-
var deviceConfigs config.DevicePluginConfigs
97-
err = json.Unmarshal(jsonbyte, &deviceConfigs)
98-
if err != nil {
99-
return "", err
100-
}
101-
klog.Infof("Device Plugin Configs: %v", fmt.Sprintf("%v", deviceConfigs))
102-
for _, val := range deviceConfigs.Nodeconfig {
103-
if os.Getenv("NODE_NAME") == val.Name {
104-
klog.Infof("Reading config from file %s", val.Name)
105-
if val.Devicememoryscaling > 0 {
106-
sConfig.DeviceMemoryScaling = val.Devicememoryscaling
107-
}
108-
if val.Devicecorescaling > 0 {
109-
sConfig.DeviceCoreScaling = val.Devicecorescaling
110-
}
111-
if val.Devicesplitcount > 0 {
112-
sConfig.DeviceSplitCount = val.Devicesplitcount
113-
}
114-
if val.FilterDevice != nil && (len(val.FilterDevice.UUID) > 0 || len(val.FilterDevice.Index) > 0) {
115-
DevicePluginFilterDevice = val.FilterDevice
116-
}
117-
if len(val.OperatingMode) > 0 {
118-
config.Mode = val.OperatingMode
119-
}
120-
klog.Infof("FilterDevice: %v", val.FilterDevice)
121-
}
122-
}
123-
return config.Mode, nil
124-
}
125-
12684
// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
127-
func NewNvidiaDevicePlugin(resourceName string, deviceCache *DeviceCache, allocatePolicy gpuallocator.Policy, socket string) *NvidiaDevicePlugin {
128-
configs, err := util.LoadConfigFromCM("volcano-vgpu-device-config")
129-
if err != nil {
130-
klog.InfoS("configMap not found", err.Error())
131-
}
132-
nvidiaConfig := config.NvidiaConfig{}
133-
if configs != nil {
134-
nvidiaConfig = configs.NvidiaConfig
135-
}
136-
nvidiaConfig.DeviceSplitCount = config.DeviceSplitCount
137-
nvidiaConfig.DeviceCoreScaling = config.DeviceCoresScaling
138-
nvidiaConfig.GPUMemoryFactor = config.GPUMemoryFactor
139-
mode, err := readFromConfigFile(&nvidiaConfig)
140-
if err != nil {
141-
klog.InfoS("readFrom device cm error", err.Error())
142-
return nil
143-
}
144-
klog.Infoln("Loaded config=", nvidiaConfig)
85+
func NewNvidiaDevicePlugin(resourceName string, deviceCache *DeviceCache, allocatePolicy gpuallocator.Policy, socket string, cfg *config.NvidiaConfig) *NvidiaDevicePlugin {
14586
dp := &NvidiaDevicePlugin{
14687
deviceCache: deviceCache,
14788
resourceName: resourceName,
14889
allocatePolicy: allocatePolicy,
14990
socket: socket,
15091
migStrategy: "none",
151-
operatingMode: mode,
152-
schedulerConfig: nvidiaConfig,
92+
operatingMode: config.Mode,
93+
schedulerConfig: cfg,
15394
// These will be reinitialized every
15495
// time the plugin server is restarted.
15596
server: nil,

pkg/plugin/vgpu/util/util.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,3 +541,58 @@ func ExtractMigTemplatesFromUUID(uuid string) (string, int, error) {
541541

542542
return templateGroupName, pos, nil
543543
}
544+
545+
func LoadNvidiaConfig() *config.NvidiaConfig {
546+
configs, err := LoadConfigFromCM("volcano-vgpu-device-config")
547+
if err != nil {
548+
klog.InfoS("configMap not found", err.Error())
549+
}
550+
nvidiaConfig := config.NvidiaConfig{}
551+
if configs != nil {
552+
nvidiaConfig = configs.NvidiaConfig
553+
}
554+
nvidiaConfig.DeviceSplitCount = config.DeviceSplitCount
555+
nvidiaConfig.DeviceCoreScaling = config.DeviceCoresScaling
556+
nvidiaConfig.GPUMemoryFactor = config.GPUMemoryFactor
557+
if err := readFromConfigFile(&nvidiaConfig); err != nil {
558+
klog.InfoS("readFrom device cm error", err.Error())
559+
}
560+
klog.Infoln("Loaded config=", nvidiaConfig)
561+
return &nvidiaConfig
562+
}
563+
564+
func readFromConfigFile(sConfig *config.NvidiaConfig) error {
565+
config.Mode = "hami-core"
566+
jsonbyte, err := os.ReadFile("/config/config.json")
567+
if err != nil {
568+
return err
569+
}
570+
var deviceConfigs config.DevicePluginConfigs
571+
err = json.Unmarshal(jsonbyte, &deviceConfigs)
572+
if err != nil {
573+
return err
574+
}
575+
klog.Infof("Device Plugin Configs: %v", fmt.Sprintf("%v", deviceConfigs))
576+
for _, val := range deviceConfigs.Nodeconfig {
577+
if os.Getenv("NODE_NAME") == val.Name {
578+
klog.Infof("Reading config from file %s", val.Name)
579+
if val.Devicememoryscaling > 0 {
580+
sConfig.DeviceMemoryScaling = val.Devicememoryscaling
581+
}
582+
if val.Devicecorescaling > 0 {
583+
sConfig.DeviceCoreScaling = val.Devicecorescaling
584+
}
585+
if val.Devicesplitcount > 0 {
586+
sConfig.DeviceSplitCount = val.Devicesplitcount
587+
}
588+
if val.FilterDevice != nil && (len(val.FilterDevice.UUID) > 0 || len(val.FilterDevice.Index) > 0) {
589+
config.DevicePluginFilterDevice = val.FilterDevice
590+
}
591+
if len(val.OperatingMode) > 0 {
592+
config.Mode = val.OperatingMode
593+
}
594+
klog.Infof("FilterDevice: %v", val.FilterDevice)
595+
}
596+
}
597+
return nil
598+
}

volcano-vgpu-device-plugin.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,8 @@ spec:
246246
fieldRef:
247247
fieldPath: spec.nodeName
248248
securityContext:
249-
allowPrivilegeEscalation: false
249+
privileged: true
250+
allowPrivilegeEscalation: true
250251
capabilities:
251252
drop: ["ALL"]
252253
add: ["SYS_ADMIN"]

0 commit comments

Comments
 (0)