@@ -36,6 +36,7 @@ type runner struct {
36
36
metadata * core.Metadata
37
37
runType string
38
38
jobInfo * core.JobInfo
39
+ monitoring bool
39
40
start time.Time
40
41
isDone chan struct {}
41
42
perfDone <- chan struct {}
@@ -92,12 +93,8 @@ func (self *runner) Init() {
92
93
util .PrintError (jErr , "monitor" ,
93
94
"Could not update log journal file. Continuing, hoping for the best." )
94
95
}
95
- // Check that the vmem limit is enough for the parent process plus
96
- // the a half gigabyte of margin over the job's physical memory
97
- // requirement.
98
- mem , _ := core .GetProcessTreeMemory (self .jobInfo .Pid , true , nil )
99
96
core .CheckMaxVmem (
100
- uint64 (self .jobInfo .MemGB * 1024 + 512 ) * 1024 * 1024 + uint64 ( mem . Vmem ) )
97
+ uint64 (self .jobInfo .VMemGB ) * 1024 * 1024 * 1024 )
101
98
self .setRlimit ()
102
99
}
103
100
@@ -123,6 +120,7 @@ func (self *runner) writeJobinfo() {
123
120
self .Fail (err , "Error reading jobInfo." )
124
121
} else {
125
122
self .jobInfo = jobInfo
123
+ self .monitoring = jobInfo .Monitor == "monitor"
126
124
}
127
125
self .jobInfo .Cwd = self .metadata .FilesPath ()
128
126
self .jobInfo .Host , _ = os .Hostname ()
@@ -216,7 +214,7 @@ func totalCpu(ru *core.RusageInfo) float64 {
216
214
func (self * runner ) Complete () {
217
215
self .done ()
218
216
target := core .CompleteFile
219
- if self .jobInfo . Monitor == "monitor" {
217
+ if self .monitoring {
220
218
if t := time .Since (self .start ); t > time .Minute * 15 {
221
219
if threads := totalCpu (self .jobInfo .RusageInfo ) /
222
220
t .Seconds (); threads > 1.5 * float64 (self .jobInfo .Threads ) {
@@ -228,6 +226,14 @@ func (self *runner) Complete() {
228
226
util .PrintError (writeError , "monitor" , "Could not write errors file." )
229
227
}
230
228
}
229
+ } else if self .jobInfo .RusageInfo .Children .MaxRss > self .jobInfo .MemGB * 1024 * 1024 {
230
+ target = core .Errors
231
+ if writeError := self .metadata .WriteRaw (target , fmt .Sprintf (
232
+ "Stage exceeded its memory quota (using %.1f, allowed %d)" ,
233
+ float64 (self .jobInfo .RusageInfo .Children .MaxRss )/ (1024 * 1024 ),
234
+ self .jobInfo .MemGB )); writeError != nil {
235
+ util .PrintError (writeError , "monitor" , "Could not write errors file." )
236
+ }
231
237
}
232
238
}
233
239
if target == core .CompleteFile {
@@ -277,6 +283,18 @@ func (self *runner) StartJob(args []string) error {
277
283
self .metadata .MetadataFilePath (core .PerfData ),
278
284
self .metadata .MetadataFilePath (core .ProfileOut ))
279
285
}
286
+ if self .monitoring && self .jobInfo .VMemGB > 0 {
287
+ // Exclude mrjob's vmem usage from the rlimit.
288
+ mem , _ := core .GetProcessTreeMemory (self .jobInfo .Pid , true , nil )
289
+ amount := int64 (self .jobInfo .VMemGB )* 1024 * 1024 * 1024 - mem .Vmem
290
+ if amount < mem .Vmem + 1024 * 1024 {
291
+ amount = mem .Vmem + 1024 * 1024
292
+ }
293
+ if err := core .SetVMemRLimit (uint64 (amount )); err != nil {
294
+ util .LogError (err , "monitor" ,
295
+ "Could not set VM rlimit." )
296
+ }
297
+ }
280
298
if err := func () error {
281
299
util .EnterCriticalSection ()
282
300
defer util .ExitCriticalSection ()
@@ -481,33 +499,52 @@ func (self *runner) WaitLoop() {
481
499
}
482
500
}
483
501
484
- func (self * runner ) getChildMemGB () float64 {
502
+ func (self * runner ) getChildMemGB () ( rss , vmem float64 ) {
485
503
proc := self .job .Process
486
504
if proc == nil {
487
- return 0
505
+ return 0 , 0
488
506
}
489
507
io := make (map [int ]* core.IoAmount )
490
508
mem , err := core .GetProcessTreeMemory (proc .Pid , true , io )
509
+ if selfMem , err := core .GetRunningMemory (self .jobInfo .Pid ); err == nil {
510
+ // Do this rather than just calling core.GetProcessTreeMemory,
511
+ // above, because we don't want to include the profiling child
512
+ // process (if any).
513
+ mem .Add (selfMem )
514
+ }
491
515
mem .IncreaseRusage (core .GetRusage ())
492
516
self .highMem .IncreaseTo (mem )
493
517
if err != nil {
494
518
util .LogError (err , "monitor" , "Error updating job statistics." )
495
519
} else {
496
520
self .ioStats .Update (io , time .Now ())
497
521
}
498
- return float64 (mem .Rss ) / (1024 * 1024 * 1024 )
522
+ return float64 (mem .Rss ) / (1024 * 1024 * 1024 ),
523
+ float64 (mem .Vmem ) / (1024 * 1024 * 1024 )
499
524
}
500
525
501
526
func (self * runner ) monitor (lastHeartbeat * time.Time ) error {
502
- if mem := self .getChildMemGB (); mem > float64 (self .jobInfo .MemGB ) {
503
- if self .jobInfo . Monitor == "monitor" {
527
+ if rss , vmem := self .getChildMemGB (); rss > float64 (self .jobInfo .MemGB ) {
528
+ if self .monitoring {
504
529
self .job .Process .Kill ()
505
- return fmt .Errorf ("Stage exceeded its memory quota (using %.1f, allowed %dG)" ,
506
- mem , self .jobInfo .MemGB )
530
+ return fmt .Errorf (
531
+ "Stage exceeded its memory quota (using %.1f, allowed %dG)" ,
532
+ rss , self .jobInfo .MemGB )
507
533
} else {
508
534
util .LogInfo ("monitor" ,
509
535
"Stage exceeded its memory quota (using %.1f, allowed %dG)" ,
510
- mem , self .jobInfo .MemGB )
536
+ rss , self .jobInfo .MemGB )
537
+ }
538
+ } else if self .jobInfo .VMemGB > 0 && vmem > float64 (self .jobInfo .VMemGB ) {
539
+ if self .monitoring {
540
+ self .job .Process .Kill ()
541
+ return fmt .Errorf (
542
+ "Stage exceeded its address space quota (using %.1f, allowed %dG)" ,
543
+ vmem , self .jobInfo .VMemGB )
544
+ } else {
545
+ util .LogInfo ("monitor" ,
546
+ "Stage exceeded its address space quota (using %.1f, allowed %dG)" ,
547
+ vmem , self .jobInfo .MemGB )
511
548
}
512
549
}
513
550
if time .Since (* lastHeartbeat ) > HeartbeatInterval {
0 commit comments