Commit d01c68f

Merge pull request oobabooga#6224 from oobabooga/dev
Merge dev branch
2 parents: e813b32 + 05676ca

18 files changed: +118, -91 lines

README.md (+8, -5)

@@ -392,15 +392,18 @@ Run `python download-model.py --help` to see all the options.
 
 https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/main/Colab-TextGen-GPU.ipynb
 
-## Contributing
+## Acknowledgment
+
+In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.
 
-If you would like to contribute to the project, check out the [Contributing guidelines](https://github.com/oobabooga/text-generation-webui/wiki/Contributing-guidelines).
+## Links
 
-## Community
+#### Community
 
 * Subreddit: https://www.reddit.com/r/oobabooga/
 * Discord: https://discord.gg/jwZCF2dPQN
 
-## Acknowledgment
+#### Support
 
-In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.
+* ko-fi: https://ko-fi.com/oobabooga
+* GitHub Sponsors: https://github.com/sponsors/oobabooga

modules/exllamav2.py (+2)

@@ -48,6 +48,8 @@ def from_pretrained(self, path_to_model):
 config.scale_pos_emb = shared.args.compress_pos_emb
 config.scale_alpha_value = shared.args.alpha_value
 config.no_flash_attn = shared.args.no_flash_attn
+config.no_xformers = shared.args.no_xformers
+config.no_sdpa = shared.args.no_sdpa
 config.num_experts_per_token = int(shared.args.num_experts_per_token)
 
 model = ExLlamaV2(config)

modules/exllamav2_hf.py (+2)

@@ -176,6 +176,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 config.scale_pos_emb = shared.args.compress_pos_emb
 config.scale_alpha_value = shared.args.alpha_value
 config.no_flash_attn = shared.args.no_flash_attn
+config.no_xformers = shared.args.no_xformers
+config.no_sdpa = shared.args.no_sdpa
 config.num_experts_per_token = int(shared.args.num_experts_per_token)
 
 return Exllamav2HF(config)
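
Both ExLlamaV2 loaders wire the new options the same way: `shared.args.no_xformers` and `shared.args.no_sdpa` are copied onto the `ExLlamaV2Config` next to the existing `no_flash_attn` switch before the model is built. The sketch below shows that pattern in isolation; it is not the project's loader code, and it assumes an exllamav2 release that exposes these config attributes (0.1.7, per the updated requirements) and an `args` object standing in for `shared.args`.

```python
# Minimal sketch, assuming exllamav2 0.1.7+, where ExLlamaV2Config exposes
# no_flash_attn / no_xformers / no_sdpa. Not the project's actual loader code.
from exllamav2 import ExLlamaV2, ExLlamaV2Config


def build_model(path_to_model, args):
    config = ExLlamaV2Config()
    config.model_dir = str(path_to_model)  # directory holding the quantized model
    config.prepare()                       # reads config.json and locates the tensors

    # Each flag disables one attention backend; exllamav2 then falls back to
    # whichever of the remaining implementations is available.
    config.no_flash_attn = args.no_flash_attn
    config.no_xformers = args.no_xformers
    config.no_sdpa = args.no_sdpa

    return ExLlamaV2(config)


# Usage with a stand-in for shared.args, e.g.:
# from types import SimpleNamespace
# model = build_model('/path/to/exl2-model',
#                     SimpleNamespace(no_flash_attn=False, no_xformers=True, no_sdpa=False))
```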

modules/llama_cpp_python_hijack.py (+14, -4)

@@ -13,11 +13,21 @@
 def llama_cpp_lib():
 global imported_module
 
+def module_to_purpose(module_name):
+if module_name == 'llama_cpp':
+return 'CPU'
+elif module_name == 'llama_cpp_cuda_tensorcores':
+return 'tensorcores'
+elif module_name == 'llama_cpp_cuda':
+return 'default'
+
+return 'unknown'
+
 return_lib = None
 
 if shared.args.cpu:
 if imported_module and imported_module != 'llama_cpp':
-raise Exception(f"Cannot import 'llama_cpp' because '{imported_module}' is already imported. See issue #1575 in llama-cpp-python. Please restart the server before attempting to use a different version of llama-cpp-python.")
+raise Exception(f"The {module_to_purpose(imported_module)} version of llama-cpp-python is already loaded. Switching to the CPU version currently requires a server restart.")
 try:
 return_lib = importlib.import_module('llama_cpp')
 imported_module = 'llama_cpp'

@@ -26,7 +36,7 @@ def llama_cpp_lib():
 
 if shared.args.tensorcores and return_lib is None:
 if imported_module and imported_module != 'llama_cpp_cuda_tensorcores':
-raise Exception(f"Cannot import 'llama_cpp_cuda_tensorcores' because '{imported_module}' is already imported. See issue #1575 in llama-cpp-python. Please restart the server before attempting to use a different version of llama-cpp-python.")
+raise Exception(f"The {module_to_purpose(imported_module)} version of llama-cpp-python is already loaded. Switching to the tensorcores version currently requires a server restart.")
 try:
 return_lib = importlib.import_module('llama_cpp_cuda_tensorcores')
 imported_module = 'llama_cpp_cuda_tensorcores'

@@ -35,7 +45,7 @@ def llama_cpp_lib():
 
 if return_lib is None:
 if imported_module and imported_module != 'llama_cpp_cuda':
-raise Exception(f"Cannot import 'llama_cpp_cuda' because '{imported_module}' is already imported. See issue #1575 in llama-cpp-python. Please restart the server before attempting to use a different version of llama-cpp-python.")
+raise Exception(f"The {module_to_purpose(imported_module)} version of llama-cpp-python is already loaded. Switching to the default version currently requires a server restart.")
 try:
 return_lib = importlib.import_module('llama_cpp_cuda')
 imported_module = 'llama_cpp_cuda'

@@ -44,7 +54,7 @@ def llama_cpp_lib():
 
 if return_lib is None and not shared.args.cpu:
 if imported_module and imported_module != 'llama_cpp':
-raise Exception(f"Cannot import 'llama_cpp' because '{imported_module}' is already imported. See issue #1575 in llama-cpp-python. Please restart the server before attempting to use a different version of llama-cpp-python.")
+raise Exception(f"The {module_to_purpose(imported_module)} version of llama-cpp-python is already loaded. Switching to the CPU version currently requires a server restart.")
 try:
 return_lib = importlib.import_module('llama_cpp')
 imported_module = 'llama_cpp'
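
Taken together, these four changes replace the internal module names ('llama_cpp', 'llama_cpp_cuda', 'llama_cpp_cuda_tensorcores') with the purpose each build serves ('CPU', 'default', 'tensorcores') in the error text; the control flow is unchanged, since only one llama-cpp-python build can be loaded per process and switching still requires a restart. Below is a condensed sketch of that selection logic, not a line-for-line copy of the module; the explicit `cpu`/`tensorcores` parameters stand in for `shared.args.cpu` and `shared.args.tensorcores`.

```python
# Condensed sketch of the build-selection logic in llama_cpp_lib(); the explicit
# flags are a stand-in for shared.args, and the structure is simplified.
import importlib

imported_module = None  # in the real module this is a process-wide global


def module_to_purpose(module_name):
    if module_name == 'llama_cpp':
        return 'CPU'
    elif module_name == 'llama_cpp_cuda_tensorcores':
        return 'tensorcores'
    elif module_name == 'llama_cpp_cuda':
        return 'default'
    return 'unknown'


def llama_cpp_lib(cpu=False, tensorcores=False):
    global imported_module
    return_lib = None

    def try_import(name, purpose):
        nonlocal return_lib
        global imported_module
        # Only one llama-cpp-python build can live in a process, so switching
        # builds mid-session requires restarting the server.
        if imported_module and imported_module != name:
            raise Exception(
                f"The {module_to_purpose(imported_module)} version of llama-cpp-python is already "
                f"loaded. Switching to the {purpose} version currently requires a server restart."
            )
        try:
            return_lib = importlib.import_module(name)
            imported_module = name
        except ImportError:
            pass

    # Same priority order as the module: CPU build if requested, then the
    # tensor-cores CUDA build, then the default CUDA build, then CPU fallback.
    if cpu:
        try_import('llama_cpp', 'CPU')
    if tensorcores and return_lib is None:
        try_import('llama_cpp_cuda_tensorcores', 'tensorcores')
    if return_lib is None:
        try_import('llama_cpp_cuda', 'default')
    if return_lib is None and not cpu:
        try_import('llama_cpp', 'CPU')

    return return_lib
```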

modules/loaders.py (+4)

@@ -84,6 +84,8 @@
 'max_seq_len',
 'cfg_cache',
 'no_flash_attn',
+'no_xformers',
+'no_sdpa',
 'num_experts_per_token',
 'cache_8bit',
 'cache_4bit',

@@ -97,6 +99,8 @@
 'gpu_split',
 'max_seq_len',
 'no_flash_attn',
+'no_xformers',
+'no_sdpa',
 'num_experts_per_token',
 'cache_8bit',
 'cache_4bit',

modules/shared.py (+2)

@@ -143,6 +143,8 @@
 group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.')
 group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
 group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
+group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
+group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
 group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
 group.add_argument('--cache_4bit', action='store_true', help='Use Q4 cache to save VRAM.')
 group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
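
Both new options are plain `store_true` switches: they default to `False` and only disable a backend when passed explicitly, mirroring the existing `--no_flash_attn` flag. A standalone illustration of that parsing behavior follows; it is not the project's full argument table, and the group name is only an assumption for the example.

```python
# Standalone illustration of the new switches; store_true flags default to False.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group('ExLlamaV2')  # group name assumed for the example
group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')

args = parser.parse_args(['--no_xformers', '--no_sdpa'])
print(args.no_flash_attn, args.no_xformers, args.no_sdpa)  # False True True
```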

modules/ui.py (+2)

@@ -85,6 +85,8 @@ def list_model_elements():
 'disable_exllamav2',
 'cfg_cache',
 'no_flash_attn',
+'no_xformers',
+'no_sdpa',
 'num_experts_per_token',
 'cache_8bit',
 'cache_4bit',

modules/ui_chat.py (+1, -1)

@@ -85,7 +85,7 @@ def create_ui():
 shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
 
 with gr.Row():
-shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode')
+shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
 
 with gr.Row():
 shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')

modules/ui_model_menu.py (+4, -2)

@@ -119,9 +119,9 @@ def create_ui():
 shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
 shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
 shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards.')
-shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
 shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use Q4 cache to save VRAM.')
+shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.')
 shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
 shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')

@@ -138,7 +138,9 @@ def create_ui():
 shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
 shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
 shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
-shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
+shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
+shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
+shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
 shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
 shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
 shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')

requirements.txt (+18, -18)

@@ -1,4 +1,4 @@
-accelerate==0.31.*
+accelerate==0.32.*
 aqlm[gpu,cpu]==1.1.6; platform_system == "Linux"
 auto-gptq==0.7.1
 bitsandbytes==0.43.*

@@ -35,29 +35,29 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.81+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.82+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.81+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.82+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.81+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.82+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.1.6/exllamav2-0.1.6-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
