Skip to content

Commit b4934f6

Browse files
jiayu.chenjoy717
jiayu.chen
authored and committed
fix pod bind wrong card when there are other gpus
fix pod bind wrong card when there are other gpus Signed-off-by: joy717 <joy717.xmu@gmail.com>
1 parent 729d824 commit b4934f6

File tree

4 files changed

+78
-3
lines changed

4 files changed

+78
-3
lines changed

internal/pkg/dcu/server.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,11 @@ func (p *Plugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.
445445
nodelock.ReleaseNodeLock(nodename, NodeLockDCU)
446446
return &kubeletdevicepluginv1beta1.AllocateResponse{}, err
447447
}
448+
drms, err := util.ListDcuDrmDevices()
449+
if err != nil {
450+
util.PodAllocationFailed(nodename, current, NodeLockDCU)
451+
return &kubeletdevicepluginv1beta1.AllocateResponse{}, err
452+
}
448453
for idx := range reqs.ContainerRequests {
449454
currentCtr, devreq, err := util.GetNextDeviceRequest(util.HygonDCUDevice, *current)
450455
klog.Infoln("deviceAllocateFromAnnotation=", devreq)
@@ -479,9 +484,24 @@ func (p *Plugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.
479484
car.Devices = append(car.Devices, dev)
480485

481486
for _, val := range devreq {
482-
var id int
487+
var devIdx = -1
483488
klog.Infof("Allocating device ID: %s", val.UUID)
484-
fmt.Sscanf(val.UUID, "DCU-%d", &id)
489+
succeedCount, err := fmt.Sscanf(val.UUID, "DCU-%d", &devIdx)
490+
if err != nil || succeedCount == 0 || devIdx == -1 {
491+
klog.Errorf("Invalid request device uuid: %s", val.UUID)
492+
util.PodAllocationFailed(nodename, current, NodeLockDCU)
493+
return &kubeletdevicepluginv1beta1.AllocateResponse{}, fmt.Errorf("invalid request device uuid %s", val.UUID)
494+
}
495+
496+
if devIdx > len(drms) {
497+
klog.Errorf("Invalid device index: %d, all devices counts are: %d", devIdx, len(drms))
498+
util.PodAllocationFailed(nodename, current, NodeLockDCU)
499+
return &kubeletdevicepluginv1beta1.AllocateResponse{}, fmt.Errorf("cant find dcu dri for %d", devIdx)
500+
}
501+
502+
drmName := drms[devIdx]
503+
var id int
504+
fmt.Sscanf(drmName, "card%d", &id)
485505

486506
devpath := fmt.Sprintf("/dev/dri/card%d", id)
487507
dev = new(kubeletdevicepluginv1beta1.DeviceSpec)
@@ -490,7 +510,7 @@ func (p *Plugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.
490510
dev.Permissions = "rw"
491511
car.Devices = append(car.Devices, dev)
492512

493-
devpath = fmt.Sprintf("/dev/dri/renderD%d", (id + 128))
513+
devpath = fmt.Sprintf("/dev/dri/renderD%d", id+128)
494514
dev = new(kubeletdevicepluginv1beta1.DeviceSpec)
495515
dev.HostPath = devpath
496516
dev.ContainerPath = devpath

internal/pkg/util/drm_slice.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package util
2+
3+
import "fmt"
4+
5+
// DrmSlice is a struct for sort to natual sorting, we assume that the element is a 'card%d' format
6+
type DrmSlice []string
7+
8+
func (p DrmSlice) Len() int { return len(p) }
9+
func (p DrmSlice) Less(i, j int) bool { return DrmSortLess(p[i], p[j]) }
10+
func (p DrmSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
11+
12+
// DrmSortLess reports whether DRM node name a must sort strictly before b.
//
// Both names are expected to be in the "card%d" format and are compared by
// their numeric suffix, so "card2" orders before "card10". The comparison is
// strict: two names with the same index are not "less" than each other, which
// is what sort.Interface needs in order to recognise equal elements.
func DrmSortLess(a, b string) bool {
	var aid, bid int
	// Scan results are deliberately ignored: a name that does not match
	// "card%d" leaves its index at zero and therefore sorts with card0.
	_, _ = fmt.Sscanf(a, "card%d", &aid)
	_, _ = fmt.Sscanf(b, "card%d", &bid)
	return aid < bid
}

internal/pkg/util/types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ const (
4242

4343
// NodeNameEnvName define env var name for use get node name.
4444
NodeNameEnvName = "NODE_NAME"
45+
46+
// HygonVendorID is the PCI vendor ID for Hygon.
47+
HygonVendorID = "0x1d94"
48+
49+
// LocationDri is the location of the dri
50+
LocationDri = "/dev/dri"
4551
)
4652

4753
type DevicePluginConfigs struct {

internal/pkg/util/util.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ import (
2222
"errors"
2323
"flag"
2424
"fmt"
25+
"os"
26+
"sort"
2527
"strconv"
2628
"strings"
2729
"time"
@@ -422,3 +424,33 @@ func PodAllocationFailed(nodeName string, pod *corev1.Pod, lockname string) {
422424
klog.Errorf("release lock failed:%v", err.Error())
423425
}
424426
}
427+
428+
// ListDcuDrmDevices list all drm devices, and filter it by dcu vendorID
429+
func ListDcuDrmDevices() ([]string, error) {
430+
filenames := make([]string, 0)
431+
dcuDrms := make([]string, 0)
432+
files, err := os.ReadDir(LocationDri)
433+
if err != nil {
434+
klog.Errorf("ListDcuDrmDevices list dri directory error: %v", err.Error())
435+
return nil, err
436+
}
437+
for _, f := range files {
438+
if !f.IsDir() && strings.HasPrefix(f.Name(), "card") {
439+
filenames = append(filenames, f.Name())
440+
}
441+
}
442+
// we need the devices order correctly, sort it first
443+
sort.Sort(DrmSlice(filenames))
444+
for _, f := range filenames {
445+
vendorID, err := os.ReadFile(fmt.Sprintf("/sys/class/drm/%s/device/vendor", f))
446+
if err != nil {
447+
klog.Errorf("ListDcuDrmDevices read vendor file error: %v", err.Error())
448+
return nil, err
449+
}
450+
fixedVendorID := strings.TrimSpace(string(vendorID))
451+
if fixedVendorID == HygonVendorID {
452+
dcuDrms = append(dcuDrms, f)
453+
}
454+
}
455+
return dcuDrms, nil
456+
}

0 commit comments

Comments
 (0)