Skip to content

Commit 5d4fd68

Browse files
jiayu.chenjoy717
jiayu.chen
authored and committed
fix pod bind wrong card when there are other gpus
Signed-off-by: joy717 <joy717.xmu@gmail.com>
1 parent 729d824 commit 5d4fd68

File tree

4 files changed

+82
-3
lines changed

4 files changed

+82
-3
lines changed

internal/pkg/dcu/server.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,11 @@ func (p *Plugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.
445445
nodelock.ReleaseNodeLock(nodename, NodeLockDCU)
446446
return &kubeletdevicepluginv1beta1.AllocateResponse{}, err
447447
}
448+
drms, err := util.ListDcuDrmDevices()
449+
if err != nil {
450+
util.PodAllocationFailed(nodename, current, NodeLockDCU)
451+
return &kubeletdevicepluginv1beta1.AllocateResponse{}, err
452+
}
448453
for idx := range reqs.ContainerRequests {
449454
currentCtr, devreq, err := util.GetNextDeviceRequest(util.HygonDCUDevice, *current)
450455
klog.Infoln("deviceAllocateFromAnnotation=", devreq)
@@ -479,9 +484,24 @@ func (p *Plugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.
479484
car.Devices = append(car.Devices, dev)
480485

481486
for _, val := range devreq {
482-
var id int
487+
var devIdx = -1
483488
klog.Infof("Allocating device ID: %s", val.UUID)
484-
fmt.Sscanf(val.UUID, "DCU-%d", &id)
489+
succeedCount, err := fmt.Sscanf(val.UUID, "DCU-%d", &devIdx)
490+
if err != nil || succeedCount == 0 || devIdx == -1 {
491+
klog.Errorf("Invalid request device uuid: %s", val.UUID)
492+
util.PodAllocationFailed(nodename, current, NodeLockDCU)
493+
return &kubeletdevicepluginv1beta1.AllocateResponse{}, fmt.Errorf("invalid request device uuid %s", val.UUID)
494+
}
495+
496+
if devIdx > len(drms) {
497+
klog.Errorf("Invalid device index: %d, all devices counts are: %d", devIdx, len(drms))
498+
util.PodAllocationFailed(nodename, current, NodeLockDCU)
499+
return &kubeletdevicepluginv1beta1.AllocateResponse{}, fmt.Errorf("cant find dcu dri for %d", devIdx)
500+
}
501+
502+
drmName := drms[devIdx]
503+
var id int
504+
fmt.Sscanf(drmName, "card%d", &id)
485505

486506
devpath := fmt.Sprintf("/dev/dri/card%d", id)
487507
dev = new(kubeletdevicepluginv1beta1.DeviceSpec)
@@ -490,7 +510,7 @@ func (p *Plugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.
490510
dev.Permissions = "rw"
491511
car.Devices = append(car.Devices, dev)
492512

493-
devpath = fmt.Sprintf("/dev/dri/renderD%d", (id + 128))
513+
devpath = fmt.Sprintf("/dev/dri/renderD%d", id+128)
494514
dev = new(kubeletdevicepluginv1beta1.DeviceSpec)
495515
dev.HostPath = devpath
496516
dev.ContainerPath = devpath

internal/pkg/util/drm_slice.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package util
2+
3+
import "fmt"
4+
5+
// DrmSlice is a struct for sort to natual sorting, we assume that the element is a 'card%d' format
6+
type DrmSlice []string
7+
8+
func (p DrmSlice) Len() int { return len(p) }
9+
func (p DrmSlice) Less(i, j int) bool { return DrmSortLess(p[i], p[j]) }
10+
func (p DrmSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
11+
12+
// DrmSortLess reports whether DRM node name a orders strictly before b when
// the two are compared by the integer that follows the "card" prefix, so
// "card2" sorts before "card10".
//
// The comparison is strict: two names carrying the same number are not "less"
// than each other. Sorting relies on this; if Less(i, j) and Less(j, i) could
// both be true the ordering would be inconsistent.
func DrmSortLess(a, b string) bool {
	var aid, bid int
	// Scan errors are deliberately ignored: a name that does not match
	// "card%d" leaves its id at the zero value and is ordered as if it were 0.
	_, _ = fmt.Sscanf(a, "card%d", &aid)
	_, _ = fmt.Sscanf(b, "card%d", &bid)
	return aid < bid
}

internal/pkg/util/types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ const (
4242

4343
// NodeNameEnvName define env var name for use get node name.
4444
NodeNameEnvName = "NODE_NAME"
45+
46+
//Vendor id
47+
HygonVendorID = "0x1d94"
48+
49+
//dir
50+
LocationDri = "/dev/dri"
4551
)
4652

4753
type DevicePluginConfigs struct {

internal/pkg/util/util.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ import (
2222
"errors"
2323
"flag"
2424
"fmt"
25+
"os"
26+
"sort"
2527
"strconv"
2628
"strings"
2729
"time"
@@ -422,3 +424,34 @@ func PodAllocationFailed(nodeName string, pod *corev1.Pod, lockname string) {
422424
klog.Errorf("release lock failed:%v", err.Error())
423425
}
424426
}
427+
428+
// ListDcuDrmDevices list all drm devices, and filter it by dcu vendorID
429+
func ListDcuDrmDevices() ([]string, error) {
430+
filenames := make([]string, 0)
431+
dcuDrms := make([]string, 0)
432+
files, err := os.ReadDir(LocationDri)
433+
if err != nil {
434+
klog.Errorf("ListDcuDrmDevices list dri directory error: %v", err.Error())
435+
return nil, err
436+
}
437+
for _, f := range files {
438+
if !f.IsDir() && strings.HasPrefix(f.Name(), "card") {
439+
filenames = append(filenames, f.Name())
440+
}
441+
}
442+
// we need the devices order correctly, sort it first
443+
sort.Sort(DrmSlice(filenames))
444+
for _, f := range filenames {
445+
fmt.Println("files in filenames: ", f)
446+
vendorID, err := os.ReadFile(fmt.Sprintf("/sys/class/drm/%s/device/vendor", f))
447+
if err != nil {
448+
klog.Errorf("ListDcuDrmDevices read vendor file error: %v", err.Error())
449+
return nil, err
450+
}
451+
fixedVendorID := strings.TrimSpace(string(vendorID))
452+
if fixedVendorID == HygonVendorID {
453+
dcuDrms = append(dcuDrms, f)
454+
}
455+
}
456+
return dcuDrms, nil
457+
}

0 commit comments

Comments
 (0)