Skip to content

Commit

Permalink
tflops_tolerance is based on the fastest GPU (#1)
Browse files Browse the repository at this point in the history
* Update `--tflops-tolerance` to compare against the best GPU
  • Loading branch information
glutamatt authored Feb 17, 2025
1 parent 62a13ef commit ca39a94
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 10 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Options:
--tolerate-software-throttling
Tolerate software throttling if the TFLOPS are in the acceptable range
--tflops-tolerance <TFLOPS_TOLERANCE>
TFLOPS tolerance (%) from the average If the TFLOPS are within this range, test pass [default: 10]
TFLOPS tolerance (%) compared to best GPU If the TFLOPS are within `tflops_tolerance`% of the best performing GPU, test will pass [default: 10]
-h, --help
Print help
-V, --version
Expand Down
19 changes: 10 additions & 9 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ struct Args {
/// Tolerate software throttling if the TFLOPS are in the acceptable range
#[clap(long, default_value = "false")]
tolerate_software_throttling: bool,
/// TFLOPS tolerance (%) from the average
/// If the TFLOPS are within this range, test pass
/// TFLOPS tolerance (%) compared to best GPU
/// If the TFLOPS are within `tflops_tolerance`% of the best performing GPU, test will pass
#[clap(long, default_value = "10")]
tflops_tolerance: f64,
}
Expand Down Expand Up @@ -378,15 +378,16 @@ fn are_gpus_healthy(
tolerate_software_throttling: bool,
) -> (bool, Vec<String>) {
let mut reasons = vec![];
let mut avg_flops = 0.0;
for r in burn_results.iter() {
avg_flops += r.flops_avg();
}
avg_flops /= burn_results.len() as f64;
// acceptable_flops is tflops_tolerance% lower than best gpu avg flops
let acceptable_flops: f64 = burn_results
.iter()
.map(|r| r.flops_avg())
.fold(0., |max, avg| {
max.max(avg * (100. - tflops_tolerance) / 100.)
});
for r in burn_results.iter() {
let mut low_flops = false;
// flag the GPU when its average flops fall below acceptable_flops, i.e. more than tflops_tolerance% under the best GPU
if (r.flops_avg() - avg_flops).abs() > tflops_tolerance / 100. * avg_flops {
if r.flops_avg() < acceptable_flops {
reasons.push(format!("GPU {} - ", r.gpu_idx) + GPU_FLOPS_REASON);
low_flops = true;
}
Expand Down

0 comments on commit ca39a94

Please sign in to comment.