Skip to content

Commit 11ce171

Browse files
authored
GC: report times spent in each step of sweeping (#56993)
We've been using this patch for a while (e.g., RelationalAI#176), and it has proven valuable for identifying the most expensive part of sweeping in a customer workload. Specifically, it highlighted that the madvise stage was the bottleneck. These metrics allowed us to effectively determine whether concurrent page sweeping would be beneficial.
1 parent 0868bec commit 11ce171

File tree

3 files changed

+60
-43
lines changed

3 files changed

+60
-43
lines changed

base/timing.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ struct GC_Num
2424
mark_time ::Int64
2525
stack_pool_sweep_time ::Int64
2626
total_sweep_time ::Int64
27+
total_sweep_page_walk_time ::Int64
28+
total_sweep_madvise_time ::Int64
29+
total_sweep_free_mallocd_memory_time ::Int64
2730
total_mark_time ::Int64
2831
total_stack_pool_sweep_time::Int64
2932
last_full_sweep ::Int64

src/gc-interface.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ typedef struct {
4646
uint64_t mark_time;
4747
uint64_t stack_pool_sweep_time;
4848
uint64_t total_sweep_time;
49+
uint64_t total_sweep_page_walk_time;
50+
uint64_t total_sweep_madvise_time;
51+
uint64_t total_sweep_free_mallocd_memory_time;
4952
uint64_t total_mark_time;
5053
uint64_t total_stack_pool_sweep_time;
5154
uint64_t last_full_sweep;

src/gc-stock.c

Lines changed: 54 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -984,9 +984,12 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa
984984
// sweep over all memory that is being used and not in a pool
985985
static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
986986
{
987+
uint64_t t_free_mallocd_memory_start = jl_hrtime();
987988
gc_sweep_foreign_objs();
988989
sweep_malloced_memory();
989990
sweep_big(ptls);
991+
uint64_t t_free_mallocd_memory_end = jl_hrtime();
992+
gc_num.total_sweep_free_mallocd_memory_time += t_free_mallocd_memory_end - t_free_mallocd_memory_start;
990993
jl_engine_sweep(gc_all_tls_states);
991994
}
992995

@@ -1380,66 +1383,74 @@ static void gc_sweep_pool(void)
13801383
}
13811384
}
13821385

1383-
// the actual sweeping
1384-
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
1385-
jl_ptls_t ptls = jl_current_task->ptls;
1386-
gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
1387-
gc_sweep_pool_parallel(ptls);
1388-
gc_sweep_wait_for_all_pages();
1389-
1390-
// reset half-pages pointers
1391-
for (int t_i = 0; t_i < n_threads; t_i++) {
1392-
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1393-
if (ptls2 != NULL) {
1394-
ptls2->gc_tls.page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
1395-
for (int i = 0; i < JL_GC_N_POOLS; i++) {
1396-
jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i];
1397-
p->newpages = NULL;
1386+
uint64_t t_page_walk_start = jl_hrtime();
1387+
{
1388+
// the actual sweeping
1389+
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
1390+
jl_ptls_t ptls = jl_current_task->ptls;
1391+
gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
1392+
gc_sweep_pool_parallel(ptls);
1393+
gc_sweep_wait_for_all_pages();
1394+
1395+
// reset half-pages pointers
1396+
for (int t_i = 0; t_i < n_threads; t_i++) {
1397+
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1398+
if (ptls2 != NULL) {
1399+
ptls2->gc_tls.page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
1400+
for (int i = 0; i < JL_GC_N_POOLS; i++) {
1401+
jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i];
1402+
p->newpages = NULL;
1403+
}
13981404
}
13991405
}
1400-
}
14011406

1402-
// merge free lists
1403-
for (int t_i = 0; t_i < n_threads; t_i++) {
1404-
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1405-
if (ptls2 == NULL) {
1406-
continue;
1407-
}
1408-
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom);
1409-
while (pg != NULL) {
1410-
jl_gc_pagemeta_t *pg2 = pg->next;
1411-
if (pg->fl_begin_offset != UINT16_MAX) {
1412-
char *cur_pg = pg->data;
1413-
jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset);
1414-
jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset);
1415-
*pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg;
1416-
pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next;
1407+
// merge free lists
1408+
for (int t_i = 0; t_i < n_threads; t_i++) {
1409+
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1410+
if (ptls2 == NULL) {
1411+
continue;
1412+
}
1413+
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom);
1414+
while (pg != NULL) {
1415+
jl_gc_pagemeta_t *pg2 = pg->next;
1416+
if (pg->fl_begin_offset != UINT16_MAX) {
1417+
char *cur_pg = pg->data;
1418+
jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset);
1419+
jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset);
1420+
*pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg;
1421+
pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next;
1422+
}
1423+
pg = pg2;
14171424
}
1418-
pg = pg2;
14191425
}
1420-
}
14211426

1422-
// null out terminal pointers of free lists
1423-
for (int t_i = 0; t_i < n_threads; t_i++) {
1424-
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1425-
if (ptls2 != NULL) {
1426-
for (int i = 0; i < JL_GC_N_POOLS; i++) {
1427-
*pfl[t_i * JL_GC_N_POOLS + i] = NULL;
1427+
// null out terminal pointers of free lists
1428+
for (int t_i = 0; t_i < n_threads; t_i++) {
1429+
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1430+
if (ptls2 != NULL) {
1431+
for (int i = 0; i < JL_GC_N_POOLS; i++) {
1432+
*pfl[t_i * JL_GC_N_POOLS + i] = NULL;
1433+
}
14281434
}
14291435
}
1430-
}
14311436

1432-
// cleanup
1433-
free(pfl);
1434-
free(new_gc_allocd_scratch);
1437+
// cleanup
1438+
free(pfl);
1439+
free(new_gc_allocd_scratch);
1440+
}
1441+
uint64_t t_page_walk_end = jl_hrtime();
1442+
gc_num.total_sweep_page_walk_time += t_page_walk_end - t_page_walk_start;
14351443

14361444
#ifdef _P64 // only enable concurrent sweeping on 64bit
14371445
// wake thread up to sweep concurrently
14381446
if (jl_n_sweepthreads > 0) {
14391447
uv_sem_post(&gc_sweep_assists_needed);
14401448
}
14411449
else {
1450+
uint64_t t_madvise_start = jl_hrtime();
14421451
gc_free_pages();
1452+
uint64_t t_madvise_end = jl_hrtime();
1453+
gc_num.total_sweep_madvise_time += t_madvise_end - t_madvise_start;
14431454
}
14441455
#else
14451456
gc_free_pages();

0 commit comments

Comments
 (0)