From 51670b0722102aec7d9422735ade31beaac686a3 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 25 Feb 2025 16:52:32 +0000
Subject: [PATCH 1/2] optimal inference with only 16 inf2 cores and batch size 8 (>80% MFU)

---
 deepseek-r1-aws.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/deepseek-r1-aws.md b/deepseek-r1-aws.md
index 775393e9e8..83bf001c6c 100644
--- a/deepseek-r1-aws.md
+++ b/deepseek-r1-aws.md
@@ -191,9 +191,9 @@ model_name = hf_model_id.split("/")[-1].lower()
 # Hub Model configuration
 hub = {
     "HF_MODEL_ID": model_id,
-    "HF_NUM_CORES": "24",
+    "HF_NUM_CORES": "16",
     "HF_AUTO_CAST_TYPE": "bf16",
-    "MAX_BATCH_SIZE": "4",
+    "MAX_BATCH_SIZE": "8",
     "MAX_INPUT_TOKENS": "3686",
     "MAX_TOTAL_TOKENS": "4096",
 }
@@ -270,10 +270,10 @@ docker run -p 8080:80 \
   -e HF_BATCH_SIZE=4 \
   -e HF_SEQUENCE_LENGTH=4096 \
   -e HF_AUTO_CAST_TYPE="bf16" \
-  -e HF_NUM_CORES=24 \
+  -e HF_NUM_CORES=16 \
   ghcr.io/huggingface/neuronx-tgi:latest \
   --model-id deepseek-ai/DeepSeek-R1-Distill-Llama-70B \
-  --max-batch-size 4 \
+  --max-batch-size 8 \
   --max-total-tokens 4096
 ```
 

From 4187fa8478fa98249a7fb833c71aa6363fa058a5 Mon Sep 17 00:00:00 2001
From: yahavb
Date: Tue, 25 Feb 2025 16:58:34 +0000
Subject: [PATCH 2/2] optimal inference with only 16 inf2 cores and batch size 8 (>80% MFU) and fix the number of --device options in the docker command

---
 deepseek-r1-aws.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/deepseek-r1-aws.md b/deepseek-r1-aws.md
index 83bf001c6c..b80e504fb6 100644
--- a/deepseek-r1-aws.md
+++ b/deepseek-r1-aws.md
@@ -263,10 +263,6 @@ docker run -p 8080:80 \
   --device=/dev/neuron5 \
   --device=/dev/neuron6 \
   --device=/dev/neuron7 \
-  --device=/dev/neuron8 \
-  --device=/dev/neuron9 \
-  --device=/dev/neuron10 \
-  --device=/dev/neuron11 \
   -e HF_BATCH_SIZE=4 \
   -e HF_SEQUENCE_LENGTH=4096 \
   -e HF_AUTO_CAST_TYPE="bf16" \
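
Reviewer note, not part of the patch: a minimal sketch of the docker invocation
as it should read once both commits apply. Each AWS Inferentia2 device exposes
2 NeuronCores, so mapping 8 devices gives the 16 cores that HF_NUM_CORES=16
expects; the 12 devices of an inf2.48xlarge (24 cores) are no longer all
needed, which is why neuron8 through neuron11 are dropped. The
/dev/neuron0 .. /dev/neuron4 lines are assumed from the unchanged part of the
file and are not visible in the hunks above.

  # 8 Neuron devices x 2 NeuronCores each = 16 cores, matching HF_NUM_CORES=16
  docker run -p 8080:80 \
    --device=/dev/neuron0 \
    --device=/dev/neuron1 \
    --device=/dev/neuron2 \
    --device=/dev/neuron3 \
    --device=/dev/neuron4 \
    --device=/dev/neuron5 \
    --device=/dev/neuron6 \
    --device=/dev/neuron7 \
    -e HF_BATCH_SIZE=4 \
    -e HF_SEQUENCE_LENGTH=4096 \
    -e HF_AUTO_CAST_TYPE="bf16" \
    -e HF_NUM_CORES=16 \
    ghcr.io/huggingface/neuronx-tgi:latest \
    --model-id deepseek-ai/DeepSeek-R1-Distill-Llama-70B \
    --max-batch-size 8 \
    --max-total-tokens 4096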