Skip to content

Commit e4e6077

Browse files
authored
Merge pull request #296 from BigVan/prefetch_conf
[Feat.] Support 'prioritize files' as a new prefetch mode
2 parents e4abacf + fb61804 commit e4e6077

File tree

4 files changed

+145
-81
lines changed

4 files changed

+145
-81
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ Accelerated Container Image is a __non-core__ sub-project of containerd.
7272

7373
* See the [PERFORMANCE](docs/PERFORMANCE.md) test about the acceleration.
7474

75+
* Enabling the 'record-trace' function can achieve higher performance for entrypoints that need to read a large amount of data at container startup. See [ENABLE_TRACE](docs/trace-prefetch.md).
76+
7577
* See how to convert OCI image into overlaybd with specified file system at [MULTI_FS_SUPPORT](docs/MULTI_FS_SUPPORT.md).
7678

7779
* See how to use layer deduplication for image conversion at [IMAGE_CONVERTOR](docs/IMAGE_CONVERTOR.md).

cmd/ctr/record_trace.go

Lines changed: 101 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"encoding/hex"
2424
"encoding/json"
2525
"fmt"
26+
"io"
2627
"os"
2728
"os/exec"
2829
"os/signal"
@@ -110,6 +111,11 @@ var recordTraceCommand = &cli.Command{
110111
Usage: "record time in seconds. When time expires, a TERM signal will be sent to the task. The task might fail to respond signal if time is too short.",
111112
Value: 60,
112113
},
114+
&cli.StringFlag{
115+
Name: "priority_list",
116+
Usage: "path of a file-list contains files to be prefetched",
117+
Value: "",
118+
},
113119
&cli.StringFlag{
114120
Name: "working-dir",
115121
Value: "/tmp/ctr-record-trace/",
@@ -140,7 +146,12 @@ var recordTraceCommand = &cli.Command{
140146
Value: "/opt/cni/bin/",
141147
},
142148
},
143-
149+
Before: func(cliCtx *cli.Context) error {
150+
if cliCtx.IsSet("priority_list") && cliCtx.Args().Len() > 2 {
151+
return errors.New("command args and priority_list can't be set at the same time")
152+
}
153+
return nil
154+
},
144155
Action: func(cliCtx *cli.Context) (err error) {
145156
recordTime := time.Duration(cliCtx.Uint("time")) * time.Second
146157
if recordTime == 0 {
@@ -206,89 +217,105 @@ var recordTraceCommand = &cli.Command{
206217
if traceFd, err = os.Create(traceFile); err != nil {
207218
return errors.New("failed to create trace file")
208219
}
209-
_ = traceFd.Close()
210220
defer os.Remove(traceFile)
211-
212-
// Create lease
213-
ctx, deleteLease, err := client.WithLease(ctx,
214-
leases.WithID(uniqueObjectString()),
215-
leases.WithExpiration(maxLeaseTime),
216-
)
217-
if err != nil {
218-
return errors.Wrap(err, "failed to create lease")
219-
}
220-
defer deleteLease(ctx)
221-
222-
// Create isolated network
223-
if !cliCtx.Bool("disable-network-isolation") {
224-
networkNamespace = uniqueObjectString()
225-
namespacePath = "/var/run/netns/" + networkNamespace
226-
if err = exec.Command("ip", "netns", "add", networkNamespace).Run(); err != nil {
227-
return errors.Wrapf(err, "failed to add netns")
228-
}
229-
defer func() {
230-
if nextErr := exec.Command("ip", "netns", "delete", networkNamespace).Run(); err == nil && nextErr != nil {
231-
err = errors.Wrapf(err, "failed to delete netns")
232-
}
233-
}()
234-
cniObj, err := createIsolatedNetwork(cliCtx)
221+
if !cliCtx.IsSet("priority_list") {
222+
_ = traceFd.Close()
223+
224+
// Create lease
225+
ctx, deleteLease, err := client.WithLease(ctx,
226+
leases.WithID(uniqueObjectString()),
227+
leases.WithExpiration(maxLeaseTime),
228+
)
235229
if err != nil {
236-
return err
230+
return errors.Wrap(err, "failed to create lease")
237231
}
238-
defer func() {
239-
if nextErr := cniObj.Remove(ctx, networkNamespace, namespacePath); err == nil && nextErr != nil {
240-
err = errors.Wrapf(nextErr, "failed to teardown network")
232+
defer deleteLease(ctx)
233+
234+
// Create isolated network
235+
if !cliCtx.Bool("disable-network-isolation") {
236+
networkNamespace = uniqueObjectString()
237+
namespacePath = "/var/run/netns/" + networkNamespace
238+
if err = exec.Command("ip", "netns", "add", networkNamespace).Run(); err != nil {
239+
return errors.Wrapf(err, "failed to add netns")
240+
}
241+
defer func() {
242+
if nextErr := exec.Command("ip", "netns", "delete", networkNamespace).Run(); err == nil && nextErr != nil {
243+
err = errors.Wrapf(err, "failed to delete netns")
244+
}
245+
}()
246+
cniObj, err := createIsolatedNetwork(cliCtx)
247+
if err != nil {
248+
return err
249+
}
250+
defer func() {
251+
if nextErr := cniObj.Remove(ctx, networkNamespace, namespacePath); err == nil && nextErr != nil {
252+
err = errors.Wrapf(nextErr, "failed to teardown network")
253+
}
254+
}()
255+
if _, err = cniObj.Setup(ctx, networkNamespace, namespacePath); err != nil {
256+
return errors.Wrapf(err, "failed to setup network for namespace")
241257
}
242-
}()
243-
if _, err = cniObj.Setup(ctx, networkNamespace, namespacePath); err != nil {
244-
return errors.Wrapf(err, "failed to setup network for namespace")
245258
}
246-
}
247259

248-
// Create container and run task
249-
fmt.Println("Create container")
250-
container, err := createContainer(ctx, client, cliCtx, image, traceFile)
251-
if err != nil {
252-
return err
253-
}
254-
defer container.Delete(ctx, containerd.WithSnapshotCleanup)
260+
// Create container and run task
261+
fmt.Println("Create container")
262+
container, err := createContainer(ctx, client, cliCtx, image, traceFile)
263+
if err != nil {
264+
return err
265+
}
266+
defer container.Delete(ctx, containerd.WithSnapshotCleanup)
255267

256-
task, err := tasks.NewTask(ctx, client, container, "", nil, false, "", nil)
257-
if err != nil {
258-
return err
259-
}
260-
defer task.Delete(ctx)
268+
task, err := tasks.NewTask(ctx, client, container, "", nil, false, "", nil)
269+
if err != nil {
270+
return err
271+
}
272+
defer task.Delete(ctx)
261273

262-
var statusC <-chan containerd.ExitStatus
263-
if statusC, err = task.Wait(ctx); err != nil {
264-
return err
265-
}
274+
var statusC <-chan containerd.ExitStatus
275+
if statusC, err = task.Wait(ctx); err != nil {
276+
return err
277+
}
266278

267-
if err := task.Start(ctx); err != nil {
268-
return err
269-
}
270-
fmt.Println("Task is running ...")
279+
if err := task.Start(ctx); err != nil {
280+
return err
281+
}
282+
fmt.Println("Task is running ...")
271283

272-
timer := time.NewTimer(recordTime)
273-
watchStop := make(chan bool)
284+
timer := time.NewTimer(recordTime)
285+
watchStop := make(chan bool)
274286

275-
// Start a thread to watch timeout and signals
276-
go watchThread(ctx, timer, task, watchStop)
287+
// Start a thread to watch timeout and signals
288+
go watchThread(ctx, timer, task, watchStop)
277289

278-
// Wait task stopped
279-
status := <-statusC
280-
if _, _, err := status.Result(); err != nil {
281-
return errors.Wrapf(err, "failed to get exit status")
282-
}
290+
// Wait task stopped
291+
status := <-statusC
292+
if _, _, err := status.Result(); err != nil {
293+
return errors.Wrapf(err, "failed to get exit status")
294+
}
283295

284-
if timer.Stop() {
285-
watchStop <- true
286-
fmt.Println("Task finished before timeout ...")
287-
}
296+
if timer.Stop() {
297+
watchStop <- true
298+
fmt.Println("Task finished before timeout ...")
299+
}
288300

289-
// Collect trace
290-
if err = collectTrace(traceFile); err != nil {
291-
return err
301+
// Collect trace
302+
if err = collectTrace(traceFile); err != nil {
303+
return err
304+
}
305+
} else {
306+
fmt.Println("Set priority list as acceleration layer")
307+
defer traceFd.Close()
308+
fn := cliCtx.String("priority_list")
309+
inf, err := os.OpenFile(fn, os.O_RDONLY, 0644)
310+
if err != nil {
311+
fmt.Printf("failed to open priority list: %s", err.Error())
312+
return err
313+
}
314+
defer inf.Close()
315+
_, err = io.Copy(traceFd, inf)
316+
if err != nil {
317+
return err
318+
}
292319
}
293320

294321
// Load trace file into content, and generate an acceleration layer
@@ -455,22 +482,23 @@ func createImageWithAccelLayer(ctx context.Context, cs content.Store, oldManifes
455482
newManifest.Config = newConfigDesc
456483
newManifest.Layers = append(oldManifest.Layers, l.Desc)
457484

485+
imageMediaType := oldManifest.MediaType
486+
458487
// V2 manifest is not adopted in OCI spec yet, so follow the docker registry V2 spec here
459488
var newManifestV2 = struct {
460489
ocispec.Manifest
461490
MediaType string `json:"mediaType"`
462491
}{
463492
Manifest: newManifest,
464-
MediaType: images.MediaTypeDockerSchema2Manifest,
493+
MediaType: imageMediaType, //images.MediaTypeDockerSchema2Manifest,
465494
}
466495

467496
newManifestData, err := json.MarshalIndent(newManifestV2, "", " ")
468497
if err != nil {
469498
return emptyDesc, err
470499
}
471-
472500
newManifestDesc := ocispec.Descriptor{
473-
MediaType: images.MediaTypeDockerSchema2Manifest,
501+
MediaType: imageMediaType, // images.MediaTypeDockerSchema2Manifest,
474502
Digest: digest.Canonical.FromBytes(newManifestData),
475503
Size: int64(len(newManifestData)),
476504
}

docs/trace-prefetch.md

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,38 @@ There are many ways to do prefetch, for instance, we can simply read extra data
88

99
Another way is to [prioritize files and use landmarks](https://github.com/containerd/stargz-snapshotter/blob/master/docs/stargz-estargz.md#prioritized-files-and-landmark-files), which is already adopted in Google's stargz. The storage engine runtime will prefetch the range where prioritized files are contained. And finally this information will be leveraged for increasing cache hit ratio and mitigating read overhead.
1010

11-
In this article we are about to introduce a new prefetch mechanism based on time sequenced I/O patterns (trace). This mechanism has been integrated as a feature into `ctr record-trace` command.
11+
In this article we are about to introduce two prefetch modes in overlayBD. One is to set prioritized files, another is a new prefetch mechanism based on time sequenced I/O patterns (trace).
12+
These two mechanisms have been integrated as a feature into `ctr record-trace` command.
1213

13-
## Trace Prefetch
14+
## Prefetch Mode
15+
16+
### Prioritize Files
17+
18+
Setting prioritized files is a simple way to improve a container's cold-start time. It is suitable for cases where the target files need to be fully loaded.
19+
20+
When the overlaybd device has been created, it will get the prioritized files from the priority_list and analyze the filesystem via libext4 before mounting, then download the target files into overlaybd's cache.
21+
22+
**Only support images based on EXT4 filesystem**
23+
24+
The priority list is a simple text file; each line contains a file path, as follows:
25+
```bash
26+
## cat /tmp/priority_list.txt
27+
/usr/bin/containerd
28+
/usr/bin/nerdctl
29+
/opt/cni/dhcp
30+
/opt/cni/vlan
31+
```
32+
33+
34+
### Trace Prefetch
1435

1536
Since every single I/O request happens on user's own filesystem will eventually be mapped into one overlaybd's layer blob, we can then record all I/Os from the layer blob's perspective, and replay them later. That's why we call it Trace Prefetch.
1637

1738
Trace prefetch is time based, and it has greater granularity and predication accuracy than stargz. We don't mark a file, because user app might only need to read a small part of it in the beginning, simply prefetching the whole file would be less efficient. Instead, we replay the trace, by the exact I/O records that happened before. Each record contains only necessary information, such as the offset and length of the blob being read.
1839

19-
Trace is stored as an independent image layer, and MUST always be the uppermost one. Neither image manifest nor container snapshotter needs to know if it is a trace layer, snapshotter just downloads and extracts it as usual. The overlaybd backstore MUST recognize trace layer, and replay it accordingly.
40+
**!! Note !!**
41+
42+
Both priority list and I/O trace are stored as an independent image layer, and MUST always be the uppermost one. Neither image manifest nor container snapshotter needs to know if it is a trace layer, snapshotter just downloads and extracts it as usual. The overlaybd backstore MUST recognize trace layer, and replay it accordingly.
2043

2144
## Terminology
2245

@@ -42,14 +65,18 @@ After Recording and Pushing, users could pull and run the specific image somewhe
4265

4366
The example usage of building a new image with trace layer would be as follows:
4467
```
45-
bin/ctr rpull --download-blobs <old_image>
68+
bin/ctr rpull --download-blobs <image>
69+
70+
## trace prefetch
71+
bin/ctr record-trace --time 20 <image> <image_with_trace>
4672
47-
bin/ctr record-trace --time 20 <old_image> <local>
73+
## prioritized files
74+
bin/ctr record-trace --priority_list <path/to/filelist> <image> <image_with_trace>
4875
49-
ctr i push <new_image> <local>
76+
ctr i push <image_with_trace>
5077
```
5178

52-
Note the `old_image` must be in overlaybd format. A temporary container will be created and do the recording. The recording progress will be terminated by either timeout, or user signals.
79+
Note the `<image>` must be in overlaybd format. A temporary container will be created to do the recording. The recording process will be terminated either by timeout or by user signals.
5380

5481
Due to current limitations, this command might ask you remove the old image locally, in order to prepare a clean environment for the recording.
5582

pkg/snapshot/overlay.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,13 @@ func NewSnapshotter(bootConfig *BootConfig, opts ...Opt) (snapshots.Snapshotter,
208208
return nil, err
209209
}
210210

211+
root, err := filepath.EvalSymlinks(bootConfig.Root)
212+
if err != nil {
213+
log.L.Errorf("invalid root: %s. (%s)", bootConfig.Root, err.Error())
214+
return nil, err
215+
}
216+
log.L.Infof("new snapshotter: root = %s", root)
217+
211218
metacopyOption := ""
212219
if _, err := os.Stat("/sys/module/overlay/parameters/metacopy"); err == nil {
213220
metacopyOption = "metacopy=on"
@@ -224,7 +231,7 @@ func NewSnapshotter(bootConfig *BootConfig, opts ...Opt) (snapshots.Snapshotter,
224231
}
225232

226233
return &snapshotter{
227-
root: bootConfig.Root,
234+
root: root,
228235
rwMode: bootConfig.RwMode,
229236
ms: ms,
230237
indexOff: indexOff,

0 commit comments

Comments
 (0)