simplify testing infrastructure (especially the AVAILABLE_MODELS dictionary) (#1121)
paulbkoch authored Feb 13, 2025
1 parent 788b468 commit a6ed0fa
Showing 7 changed files with 231 additions and 213 deletions.
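The heart of the change is visible in the diff of tests/conftest.py below: model construction moves out of a module-level AVAILABLE_MODELS dictionary of lambdas and into a plain if-chain inside the session-scoped selected_model fixture. A minimal, self-contained sketch of the before/after pattern follows; the helper names are hypothetical and abbreviated to two models, and the real fixture constructs guidance.models objects rather than returning dictionaries.

# Hypothetical sketch of the refactoring pattern, not code from this commit.

# Before: a module-level registry of lambdas, resolved later by a fixture.
AVAILABLE_MODELS = {
    "transformers_gpt2_cpu": lambda: dict(name="transformers:gpt2", kwargs={}),
    "transformers_gpt2_gpu": lambda: dict(name="transformers:gpt2", kwargs={"device_map": "cuda:0"}),
}

def load_via_registry(selected_model_name: str) -> dict:
    # Look up the key, then call the lambda to build the spec.
    return AVAILABLE_MODELS[selected_model_name]()

# After: one function branches on the name and builds the result directly.
def load_directly(selected_model_name: str) -> dict:
    if selected_model_name == "transformers_gpt2_cpu":
        return dict(name="transformers:gpt2", kwargs={})
    if selected_model_name == "transformers_gpt2_gpu":
        return dict(name="transformers:gpt2", kwargs={"device_map": "cuda:0"})
    raise ValueError(f"No support for selected_model_name {selected_model_name}")

# Both paths produce the same spec for a given key.
assert load_via_registry("transformers_gpt2_cpu") == load_directly("transformers_gpt2_cpu")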
310 changes: 140 additions & 170 deletions tests/conftest.py
@@ -1,138 +1,16 @@
import os
import pathlib
import random
import tempfile
import time
import uuid
import pytest
import requests
import importlib

from guidance import models

# The naming convention for the keys is "<loader>_<model>_<host>" where:
# - 'loader' is 'transformers' or 'llamacpp'
# - 'model' contains relevant information about the model itself
# - 'host' is 'cpu' or 'gpu' as appropriate

AVAILABLE_MODELS = {}

# GEMMA 2
AVAILABLE_MODELS["llamacpp_gemma2_9b_cpu"] = lambda: dict(
# Note that this model requires an appropriate
# HF_TOKEN environment variable
name="huggingface_hubllama:bartowski/gemma-2-9b-it-GGUF:gemma-2-9b-it-IQ2_XS.gguf",
kwargs={"verbose": True, "n_ctx": 4096},
)
AVAILABLE_MODELS["transformers_gemma2_9b_cpu"] = lambda: dict(
# Note that this model requires an appropriate
# HF_TOKEN environment variable
name="transformers:google/gemma-2-9b-it",
kwargs={
"quantization_config": importlib.import_module("transformers").BitsAndBytesConfig(load_in_8bit=True),
},
)
AVAILABLE_MODELS["transformers_gemma2_9b_gpu"] = lambda: dict(
# Note that this model requires an appropriate
# HF_TOKEN environment variable
name="transformers:google/gemma-2-9b-it",
kwargs={
"device_map": "cuda:0",
"quantization_config": importlib.import_module("transformers").BitsAndBytesConfig(load_in_4bit=True),
},
)

# GPT 2
AVAILABLE_MODELS["transformers_gpt2_cpu"] = lambda: dict(name="transformers:gpt2", kwargs=dict())
AVAILABLE_MODELS["transformers_gpt2_gpu"] = lambda: dict(name="transformers:gpt2", kwargs={"device_map": "cuda:0"})

# LLAMA 2
AVAILABLE_MODELS["llamacpp_llama2_7b_cpu"] = lambda: dict(
name="huggingface_hubllama:TheBloke/Llama-2-7B-GGUF:llama-2-7b.Q5_K_M.gguf",
kwargs={"verbose": True, "n_ctx": 4096},
)
AVAILABLE_MODELS["llamacpp_llama2_7b_gpu"] = lambda: dict(
name="huggingface_hubllama:TheBloke/Llama-2-7B-GGUF:llama-2-7b.Q5_K_M.gguf",
kwargs={"verbose": True, "n_gpu_layers": -1, "n_ctx": 4096},
)

# LLAMA 3
AVAILABLE_MODELS["transformers_llama3_8b_cpu"] = lambda: dict(
# Note that this model requires an appropriate
# HF_TOKEN environment variable
name="transformers:meta-llama/Meta-Llama-3-8B-Instruct",
kwargs={"trust_remote_code": True, "torch_dtype": importlib.import_module("torch").bfloat16},
)
AVAILABLE_MODELS["transformers_llama3_8b_gpu"] = lambda: dict(
# Note that this model requires an appropriate
# HF_TOKEN environment variable
name="transformers:meta-llama/Meta-Llama-3-8B-Instruct",
kwargs={"trust_remote_code": True, "torch_dtype": importlib.import_module("torch").bfloat16, "device_map": "cuda:0"},
)

# MISTRAL
AVAILABLE_MODELS["transformers_mistral_7b_cpu"] = lambda: dict(
name="transformers:mistralai/Mistral-7B-v0.1", kwargs=dict()
)
AVAILABLE_MODELS["llamacpp_mistral_7b_cpu"] = lambda: dict(
name="huggingface_hubllama:TheBloke/Mistral-7B-Instruct-v0.2-GGUF:mistral-7b-instruct-v0.2.Q8_0.gguf",
kwargs={"verbose": True, "n_ctx": 2048},
)

# PHI 2
AVAILABLE_MODELS["transformers_phi2_cpu"] = lambda: dict(
name="transformers:microsoft/phi-2", kwargs={"trust_remote_code": True}
)
AVAILABLE_MODELS["transformers_phi2_gpu"] = lambda: dict(
name="transformers:microsoft/phi-2",
kwargs={"trust_remote_code": True, "device_map": "cuda:0"},
)

# PHI 3
AVAILABLE_MODELS["transformers_phi3_mini_4k_instruct_cpu"] = lambda: dict(
name="transformers:microsoft/Phi-3-mini-4k-instruct",
kwargs={"trust_remote_code": True},
)
AVAILABLE_MODELS["llamacpp_phi3_mini_4k_instruct_cpu"] = lambda: dict(
name="huggingface_hubllama:microsoft/Phi-3-mini-4k-instruct-gguf:Phi-3-mini-4k-instruct-q4.gguf",
kwargs={"verbose": True, "n_ctx": 4096},
)
AVAILABLE_MODELS["transformers_phi3_small_8k_instruct_gpu"] = lambda: dict(
name="transformers:microsoft/Phi-3-small-8k-instruct",
kwargs={"trust_remote_code": True, "load_in_8bit": True, "device_map": "cuda:0"},
)

# QWEN2DOT5
AVAILABLE_MODELS["transformers_qwen2dot5_0dot5b_cpu"] = lambda: dict(
name="transformers:Qwen/Qwen2.5-0.5B", kwargs=dict()
)
AVAILABLE_MODELS["transformers_qwen2dot5_0dot5b_gpu"] = lambda: dict(
name="transformers:Qwen/Qwen2.5-0.5B", kwargs={"device_map": "cuda:0"}
)
AVAILABLE_MODELS["transformers_qwen2dot5_0dot5b_instruct_cpu"] = lambda: dict(
name="transformers:Qwen/Qwen2.5-0.5B-Instruct", kwargs=dict()
)
AVAILABLE_MODELS["transformers_qwen2dot5_0dot5b_instruct_gpu"] = lambda: dict(
name="transformers:Qwen/Qwen2.5-0.5B-Instruct", kwargs={"device_map": "cuda:0"}
)


# Ensure that asserts from tests/utils.py are rewritten by pytest to show helpful messages
pytest.register_assert_rewrite("tests.utils")


from .utils import get_model

SELECTED_MODEL_ENV_VARIABLE = "GUIDANCE_SELECTED_MODEL"

def pytest_addoption(parser):
SELECTED_MODEL_ENV_VARIABLE = "GUIDANCE_SELECTED_MODEL"
default_model = os.getenv(SELECTED_MODEL_ENV_VARIABLE, "transformers_gpt2_cpu")
parser.addoption(
"--selected_model",
action="store",
default=default_model,
type=str,
choices=AVAILABLE_MODELS.keys(),
help=f"LLM to load when needed. Set default via environment variable {SELECTED_MODEL_ENV_VARIABLE}",
)

@@ -143,13 +21,7 @@ def selected_model_name(pytestconfig) -> str:


@pytest.fixture(scope="session")
def selected_model_info(selected_model_name: str):
model_info = AVAILABLE_MODELS[selected_model_name]()
return model_info


@pytest.fixture(scope="module")
def selected_model(selected_model_info: str) -> models.Model:
def selected_model(selected_model_name: str) -> models.Model:
"""Get a concrete model for tests
This fixture is for tests which are supposed
@@ -161,10 +33,145 @@ def selected_model(selected_model_info: str) -> models.Model:
When running the tests, the model used is
controlled by the '--selected_model' command
line argument to pytest.
The naming convention for the keys is "<loader>_<model>_<host>" where:
- 'loader' is 'transformers' or 'llamacpp'
- 'model' contains relevant information about the model itself
- 'host' is 'cpu' or 'gpu' as appropriate
"""
model = get_model(selected_model_info["name"], **(selected_model_info["kwargs"]))
assert model is not None
return model

# GEMMA 2
if selected_model_name == "llamacpp_gemma2_9b_cpu":
# Note that this model requires an appropriate HF_TOKEN environment variable
from huggingface_hub import hf_hub_download

return models.LlamaCpp(
hf_hub_download(
repo_id="bartowski/gemma-2-9b-it-GGUF", filename="gemma-2-9b-it-IQ2_XS.gguf"
),
verbose=True,
n_ctx=4096,
)
if selected_model_name == "transformers_gemma2_9b_cpu":
# Note that this model requires an appropriate HF_TOKEN environment variable
from transformers import BitsAndBytesConfig

return models.Transformers(
"google/gemma-2-9b-it",
quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
if selected_model_name == "transformers_gemma2_9b_gpu":
# Note that this model requires an appropriate HF_TOKEN environment variable
from transformers import BitsAndBytesConfig

return models.Transformers(
"google/gemma-2-9b-it",
device_map="cuda:0",
quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# GPT 2
if selected_model_name == "transformers_gpt2_cpu":
return models.Transformers("gpt2")
if selected_model_name == "transformers_gpt2_gpu":
return models.Transformers("gpt2", device_map="cuda:0")

# LLAMA 2
if selected_model_name == "llamacpp_llama2_7b_cpu":
from huggingface_hub import hf_hub_download

return models.LlamaCpp(
hf_hub_download(repo_id="TheBloke/Llama-2-7B-GGUF", filename="llama-2-7b.Q5_K_M.gguf"),
verbose=True,
n_ctx=4096,
)
if selected_model_name == "llamacpp_llama2_7b_gpu":
from huggingface_hub import hf_hub_download

return models.LlamaCpp(
hf_hub_download(repo_id="TheBloke/Llama-2-7B-GGUF", filename="llama-2-7b.Q5_K_M.gguf"),
verbose=True,
n_ctx=4096,
n_gpu_layers=-1,
)

# LLAMA 3
if selected_model_name == "transformers_llama3_8b_cpu":
# Note that this model requires an appropriate HF_TOKEN environment variable
from torch import bfloat16

return models.Transformers(
"meta-llama/Meta-Llama-3-8B-Instruct",
trust_remote_code=True,
torch_dtype=bfloat16,
)
if selected_model_name == "transformers_llama3_8b_gpu":
# Note that this model requires an appropriate HF_TOKEN environment variable
from torch import bfloat16

return models.Transformers(
"meta-llama/Meta-Llama-3-8B-Instruct",
trust_remote_code=True,
torch_dtype=bfloat16,
device_map="cuda:0",
)

# MISTRAL
if selected_model_name == "transformers_mistral_7b_cpu":
return models.Transformers("mistralai/Mistral-7B-v0.1")
if selected_model_name == "llamacpp_mistral_7b_cpu":
from huggingface_hub import hf_hub_download

return models.LlamaCpp(
hf_hub_download(
repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
filename="mistral-7b-instruct-v0.2.Q8_0.gguf",
),
verbose=True,
n_ctx=2048,
)

# PHI 2
if selected_model_name == "transformers_phi2_cpu":
return models.Transformers("microsoft/phi-2", trust_remote_code=True)
if selected_model_name == "transformers_phi2_gpu":
return models.Transformers("microsoft/phi-2", trust_remote_code=True, device_map="cuda:0")

# PHI 3
if selected_model_name == "transformers_phi3_mini_4k_instruct_cpu":
return models.Transformers("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
if selected_model_name == "llamacpp_phi3_mini_4k_instruct_cpu":
from huggingface_hub import hf_hub_download

return models.LlamaCpp(
hf_hub_download(
repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
filename="Phi-3-mini-4k-instruct-q4.gguf",
),
verbose=True,
n_ctx=4096,
)
if selected_model_name == "transformers_phi3_small_8k_instruct_gpu":
return models.Transformers(
"microsoft/Phi-3-small-8k-instruct",
trust_remote_code=True,
load_in_8bit=True,
device_map="cuda:0",
)

# QWEN2DOT5
if selected_model_name == "transformers_qwen2dot5_0dot5b_cpu":
return models.Transformers("Qwen/Qwen2.5-0.5B")
if selected_model_name == "transformers_qwen2dot5_0dot5b_gpu":
return models.Transformers("Qwen/Qwen2.5-0.5B", device_map="cuda:0")
if selected_model_name == "transformers_qwen2dot5_0dot5b_instruct_cpu":
return models.Transformers("Qwen/Qwen2.5-0.5B-Instruct")
if selected_model_name == "transformers_qwen2dot5_0dot5b_instruct_gpu":
return models.Transformers("Qwen/Qwen2.5-0.5B-Instruct", device_map="cuda:0")

raise ValueError(
f"No support for selected_model_name {selected_model_name}"
) # pragma: no cover


@pytest.fixture(scope="module")
@@ -179,40 +186,3 @@ def llamacpp_model(selected_model, selected_model_name):
return selected_model
else:
pytest.skip("Requires Llama-Cpp model")


@pytest.fixture(scope="function")
def rate_limiter() -> int:
"""Limit test execution rate
Any test using this fixture will have a
random delay inserted before the test runs.
It can be used as a crude rate limiter for
tests which call external APIs
"""
delay_secs = random.randint(10, 30)
time.sleep(delay_secs)
return delay_secs


@pytest.fixture(scope="session")
def remote_image_url():
return "https://picsum.photos/300/200"


@pytest.fixture(scope="session")
def local_image_path(remote_image_url):
with tempfile.TemporaryDirectory() as temp_dir:
td = pathlib.Path(temp_dir)
filename = f"{str(uuid.uuid4())}.jpg"
with open(td / filename, "wb") as file:
response = requests.get(remote_image_url)
file.write(response.content)
assert (td / filename).exists()
yield td / filename


@pytest.fixture(scope="session")
def local_image_bytes(local_image_path):
with open(local_image_path, "rb") as f:
return f.read()
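Test code that uses the selected_model fixture keeps the same shape after this refactor; only the now-unused selected_model_info fixture disappears from test signatures, as in the llama.cpp test file below. A hypothetical test sketching the intended usage (the test body is illustrative and not part of this commit; the model is chosen via the --selected_model pytest option or the GUIDANCE_SELECTED_MODEL environment variable defined in conftest.py):

# Hypothetical usage sketch; run with e.g.
#   pytest --selected_model transformers_gpt2_cpu
# or set GUIDANCE_SELECTED_MODEL before invoking pytest.
from guidance import gen, models


def test_simple_generation(selected_model: models.Model):
    lm = selected_model
    # Generate a short completion and capture it under the name "answer".
    lm += "1 + 1 = " + gen(name="answer", max_tokens=2)
    assert len(lm["answer"]) > 0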
12 changes: 6 additions & 6 deletions tests/model_specific/llama_cpp/test_llama_cpp.py
@@ -124,47 +124,47 @@ def test_subtoken_forced(llamacpp_model: guidance.models.Model):
assert str(lm) == "How much is 2 + 2? ("


def test_llama_cpp_almost_one_batch(llamacpp_model, selected_model_info):
def test_llama_cpp_almost_one_batch(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * (batch_size - 1)
lm += long_str + gen(max_tokens=10)
assert len(str(lm)) > len(long_str)


def test_llama_cpp_exactly_one_batch(llamacpp_model, selected_model_info):
def test_llama_cpp_exactly_one_batch(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * batch_size
lm += long_str + gen(max_tokens=10)
assert len(str(lm)) > len(long_str)


def test_llama_cpp_more_than_one_batch(llamacpp_model, selected_model_info):
def test_llama_cpp_more_than_one_batch(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * (batch_size + 1)
lm += long_str + gen(max_tokens=10)
assert len(str(lm)) > len(long_str)


def test_llama_cpp_almost_two_batches(llamacpp_model, selected_model_info):
def test_llama_cpp_almost_two_batches(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * ((2 * batch_size) - 1)
lm += long_str + gen(max_tokens=10)
assert len(str(lm)) > len(long_str)


def test_llama_cpp_two_batches(llamacpp_model, selected_model_info):
def test_llama_cpp_two_batches(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * (2 * batch_size)
lm += long_str + gen(max_tokens=10)
assert len(str(lm)) > len(long_str)


def test_llama_cpp_more_than_two_batches(llamacpp_model, selected_model_info):
def test_llama_cpp_more_than_two_batches(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * ((2 * batch_size) + 1)
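The six batch-boundary tests above differ only in how long the prompt is relative to the llama.cpp batch size; dropping the unused selected_model_info parameter does not change what they exercise. A hypothetical helper, not part of the commit, enumerating the prompt lengths (counted in repetitions of the BOS token) that these tests cover:

# Hypothetical helper: BOS-token repetition counts used by the batch-boundary
# tests, relative to the llama.cpp batch size n_batch.
def boundary_lengths(n_batch: int) -> dict:
    return {
        "almost_one_batch": n_batch - 1,
        "exactly_one_batch": n_batch,
        "more_than_one_batch": n_batch + 1,
        "almost_two_batches": 2 * n_batch - 1,
        "two_batches": 2 * n_batch,
        "more_than_two_batches": 2 * n_batch + 1,
    }


print(boundary_lengths(512))  # example with an assumed batch size of 512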