
Commit dd97a83

Merge pull request oobabooga#6271 from oobabooga/dev
Merge dev branch

2 parents: af839d2 + e4624fb
28 files changed: +135 -123 lines

Colab-TextGen-GPU.ipynb  (+16, -33)

@@ -22,7 +22,7 @@
 "source": [
 "# oobabooga/text-generation-webui\n",
 "\n",
-"After running both cells, a public gradio URL will appear at the bottom in a few minutes. You can optionally generate an API link.\n",
+"After running both cells, a public gradio URL will appear at the bottom in around 10 minutes. You can optionally generate an API link.\n",
 "\n",
 "* Project page: https://github.com/oobabooga/text-generation-webui\n",
 "* Gradio server status: https://status.gradio.app/"
@@ -53,44 +53,28 @@
 "\n",
 "#@markdown If unsure about the branch, write \"main\" or leave it blank.\n",
 "\n",
-"import torch\n",
+"import os\n",
 "from pathlib import Path\n",
 "\n",
+"os.environ.pop('PYTHONPATH', None)\n",
+"\n",
 "if Path.cwd().name != 'text-generation-webui':\n",
-"  print(\"Installing the webui...\")\n",
+"  print(\"\\033[1;32;1m\\n --> Installing the web UI. This will take a while, but after the initial setup, you can download and test as many models as you like.\\033[0;37;0m\\n\")\n",
 "\n",
 "  !git clone https://github.com/oobabooga/text-generation-webui\n",
 "  %cd text-generation-webui\n",
 "\n",
-"  torver = torch.__version__\n",
-"  print(f\"TORCH: {torver}\")\n",
-"  is_cuda118 = '+cu118' in torver  # 2.1.0+cu118\n",
-"\n",
-"  if is_cuda118:\n",
-"    !python -m pip install --upgrade torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118\n",
-"  else:\n",
-"    !python -m pip install --upgrade torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121\n",
-"\n",
-"  textgen_requirements = open('requirements.txt').read().splitlines()\n",
-"  if is_cuda118:\n",
-"    textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements]\n",
-"  with open('temp_requirements.txt', 'w') as file:\n",
-"    file.write('\\n'.join(textgen_requirements))\n",
-"\n",
-"  !pip install -r temp_requirements.txt --upgrade\n",
-"\n",
-"  print(\"\\033[1;32;1m\\n --> If you see a warning about \\\"previously imported packages\\\", just ignore it.\\033[0;37;0m\")\n",
-"  print(\"\\033[1;32;1m\\n --> There is no need to restart the runtime.\\n\\033[0;37;0m\")\n",
-"\n",
-"  try:\n",
-"    import flash_attn\n",
-"  except:\n",
-"    !pip uninstall -y flash_attn\n",
+"  # Install the project in an isolated environment\n",
+"  !GPU_CHOICE=A \\\n",
+"  USE_CUDA118=FALSE \\\n",
+"  LAUNCH_AFTER_INSTALL=FALSE \\\n",
+"  INSTALL_EXTENSIONS=FALSE \\\n",
+"  ./start_linux.sh\n",
 "\n",
 "# Parameters\n",
-"model_url = \"https://huggingface.co/TheBloke/MythoMax-L2-13B-GPTQ\" #@param {type:\"string\"}\n",
-"branch = \"gptq-4bit-32g-actorder_True\" #@param {type:\"string\"}\n",
-"command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant\" #@param {type:\"string\"}\n",
+"model_url = \"https://huggingface.co/turboderp/gemma-2-9b-it-exl2\" #@param {type:\"string\"}\n",
+"branch = \"8.0bpw\" #@param {type:\"string\"}\n",
+"command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant --no_flash_attn\" #@param {type:\"string\"}\n",
 "api = False #@param {type:\"boolean\"}\n",
 "\n",
 "if api:\n",
@@ -116,11 +100,10 @@
 "  output_folder = \"\"\n",
 "\n",
 "# Start the web UI\n",
-"cmd = f\"python server.py --share\"\n",
+"cmd = f\"./start_linux.sh {command_line_flags} --share\"\n",
 "if output_folder != \"\":\n",
 "  cmd += f\" --model {output_folder}\"\n",
-"cmd += f\" {command_line_flags}\"\n",
-"print(cmd)\n",
+"\n",
 "!$cmd"
 ],
 "metadata": {

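Note: the notebook now defers environment setup to the project's own installer instead of pinning torch and requirements itself. As a rough illustration, the same non-interactive install could be driven from a plain Python script. The variable names below mirror the notebook cell above; treating them as a stable interface of start_linux.sh is an assumption taken from that cell, not from documented behavior.

import os
import subprocess

# Sketch: reproduce the notebook's non-interactive install outside Colab.
# GPU_CHOICE / USE_CUDA118 / LAUNCH_AFTER_INSTALL / INSTALL_EXTENSIONS are the
# variables the notebook sets before calling the installer.
env = os.environ.copy()
env.update({
    "GPU_CHOICE": "A",               # first GPU option, no interactive prompt
    "USE_CUDA118": "FALSE",
    "LAUNCH_AFTER_INSTALL": "FALSE",
    "INSTALL_EXTENSIONS": "FALSE",
})

# Assumes the repository has already been cloned into ./text-generation-webui
subprocess.run(["./start_linux.sh"], cwd="text-generation-webui", env=env, check=True)
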
cmd_linux.sh  (+1, -1)

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 cd "$(dirname "${BASH_SOURCE[0]}")"
 

css/html_instruct_style.css  (-8)

@@ -39,14 +39,6 @@
   margin-bottom: 0 !important;
 }
 
-.dark .message-body p em {
-  color: rgb(198 202 214) !important;
-}
-
-.message-body p em {
-  color: rgb(110 110 110) !important;
-}
-
 .gradio-container .chat .assistant-message {
   padding: 20px;
   background: #f4f4f4;

css/main.css  (+8)

@@ -406,6 +406,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
   color: var(--body-text-color);
 }
 
+.dark .message q {
+  color: #f5b031;
+}
+
+.message q::before, .message q::after {
+  content: "";
+}
+
 .message-body li {
   list-style-position: outside;
 }

js/main.js  (-2)

@@ -213,12 +213,10 @@ function doSyntaxHighlighting() {
       renderMathInElement(element, {
         delimiters: [
           { left: "$$", right: "$$", display: true },
-          { left: "$", right: "$", display: false },
           { left: "\\(", right: "\\)", display: false },
           { left: "\\[", right: "\\]", display: true },
         ],
       });
-
     });
 
   observer.observe(targetElement, config);

modules/LoRA.py  (-2)

@@ -72,8 +72,6 @@ def add_lora_autogptq(lora_names):
     else:
         if len(lora_names) > 1:
             logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
-        if not shared.args.no_inject_fused_attention:
-            logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.')
 
         peft_config = GPTQLoraConfig(
             inference_mode=True,

modules/chat.py  (+18, -3)

@@ -17,7 +17,11 @@
 import modules.shared as shared
 from modules import utils
 from modules.extensions import apply_extensions
-from modules.html_generator import chat_html_wrapper, make_thumbnail
+from modules.html_generator import (
+    chat_html_wrapper,
+    convert_to_markdown,
+    make_thumbnail
+)
 from modules.logging_colors import logger
 from modules.text_generation import (
     generate_reply,
@@ -368,7 +372,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
 
 
 def impersonate_wrapper(text, state):
-
     static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
     prompt = generate_chat_prompt('', state, impersonate=True)
@@ -488,7 +491,7 @@ def start_new_chat(state):
     greeting = replace_character_names(state['greeting'], state['name1'], state['name2'])
     if greeting != '':
         history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]]
-        history['visible'] += [['', apply_extensions('output', greeting, state, is_chat=True)]]
+        history['visible'] += [['', apply_extensions('output', html.escape(greeting), state, is_chat=True)]]
 
     unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
     save_history(history, unique_id, state['character_menu'], state['mode'])
@@ -1044,6 +1047,8 @@ def handle_unique_id_select(state):
     history = load_history(state['unique_id'], state['character_menu'], state['mode'])
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [history, html]
 
 
@@ -1052,6 +1057,8 @@ def handle_start_new_chat_click(state):
     histories = find_all_histories_with_first_prompts(state)
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [history, html, gr.update(choices=histories, value=histories[0][1])]
 
 
@@ -1061,6 +1068,8 @@ def handle_delete_chat_confirm_click(state):
     history, unique_id = load_history_after_deletion(state, index)
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [
         history,
         html,
@@ -1099,6 +1108,8 @@ def handle_upload_chat_history(load_chat_history, state):
 
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [
         history,
         html,
@@ -1119,6 +1130,8 @@ def handle_character_menu_change(state):
     histories = find_all_histories_with_first_prompts(state)
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [
         history,
         html,
@@ -1136,6 +1149,8 @@ def handle_mode_change(state):
     histories = find_all_histories_with_first_prompts(state)
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [
         history,
         html,

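Note: the chat handlers above now call convert_to_markdown.cache_clear() whenever the displayed history changes, which presumably keeps the now-unbounded cache from carrying entries across chats. This is the standard functools.lru_cache pattern; a minimal illustration follows, where convert is only a stand-in for the real convert_to_markdown.

import functools

@functools.lru_cache(maxsize=None)       # unbounded cache, as in the new decorator
def convert(text: str) -> str:
    print(f"converting {text!r}")         # visible side effect to expose cache misses
    return f"<p>{text}</p>"               # stand-in for the real markdown conversion

convert("hello")        # computed
convert("hello")        # served from the cache, no print
convert.cache_clear()   # what the handlers call when a different history is loaded
convert("hello")        # computed again
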
modules/html_generator.py  (+28, -1)

@@ -42,13 +42,39 @@ def fix_newlines(string):
     return string
 
 
+def replace_quotes(text):
+
+    # Define a list of quote pairs (opening and closing), using HTML entities
+    quote_pairs = [
+        ('&quot;', '&quot;'),  # Double quotes
+        ('&ldquo;', '&rdquo;'),  # Unicode left and right double quotation marks
+        ('&lsquo;', '&rsquo;'),  # Unicode left and right single quotation marks
+        ('&laquo;', '&raquo;'),  # French quotes
+        ('&bdquo;', '&ldquo;'),  # German quotes
+        ('&lsquo;', '&rsquo;'),  # Alternative single quotes
+        ('&#8220;', '&#8221;'),  # Unicode quotes (numeric entities)
+        ('&#x201C;', '&#x201D;'),  # Unicode quotes (hex entities)
+    ]
+
+    # Create a regex pattern that matches any of the quote pairs, including newlines
+    pattern = '|'.join(f'({re.escape(open_q)})(.*?)({re.escape(close_q)})' for open_q, close_q in quote_pairs)
+
+    # Replace matched patterns with <q> tags, keeping original quotes
+    replaced_text = re.sub(pattern, lambda m: f'<q>{m.group(1)}{m.group(2)}{m.group(3)}</q>', text, flags=re.DOTALL)
+
+    return replaced_text
+
+
 def replace_blockquote(m):
     return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
 
 
-@functools.lru_cache(maxsize=4096)
+@functools.lru_cache(maxsize=None)
 def convert_to_markdown(string):
 
+    # Quote to <q></q>
+    string = replace_quotes(string)
+
     # Blockquote
     string = re.sub(r'(^|[\n])&gt;', r'\1>', string)
     pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
@@ -124,6 +150,7 @@ def convert_to_markdown_wrapped(string, use_cache=True):
 
 
 def generate_basic_html(string):
+    convert_to_markdown.cache_clear()
     string = convert_to_markdown(string)
     string = f'<style>{readable_css}</style><div class="readable-container">{string}</div>'
     return string

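Note: the new replace_quotes pass runs on already-HTML-escaped text and wraps each quoted span in <q> tags, which the css/main.css change above then colors in dark mode. A small standalone sketch of the same substitution, reduced to the plain &quot; pair for brevity:

import re

quote_pairs = [('&quot;', '&quot;')]  # subset of the pairs defined in the diff
pattern = '|'.join(f'({re.escape(o)})(.*?)({re.escape(c)})' for o, c in quote_pairs)

text = 'She said &quot;follow me&quot; and turned around.'
print(re.sub(pattern, lambda m: f'<q>{m.group(1)}{m.group(2)}{m.group(3)}</q>', text, flags=re.DOTALL))
# -> She said <q>&quot;follow me&quot;</q> and turned around.
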
modules/loaders.py  (-10)

@@ -127,15 +127,6 @@
         'no_use_fast',
         'autogptq_info',
     ],
-    'AutoAWQ': [
-        'cpu_memory',
-        'gpu_memory',
-        'auto_devices',
-        'max_seq_len',
-        'no_inject_fused_attention',
-        'trust_remote_code',
-        'no_use_fast',
-    ],
     'HQQ': [
         'hqq_backend',
         'trust_remote_code',
@@ -200,7 +191,6 @@ def transformers_samplers():
 loaders_samplers = {
     'Transformers': transformers_samplers(),
     'AutoGPTQ': transformers_samplers(),
-    'AutoAWQ': transformers_samplers(),
     'HQQ': transformers_samplers(),
     'ExLlamav2': {
         'temperature',

modules/models.py  (-19)

@@ -75,7 +75,6 @@ def load_model(model_name, loader=None):
         'llamacpp_HF': llamacpp_HF_loader,
         'ExLlamav2': ExLlamav2_loader,
         'ExLlamav2_HF': ExLlamav2_HF_loader,
-        'AutoAWQ': AutoAWQ_loader,
         'HQQ': HQQ_loader,
         'TensorRT-LLM': TensorRT_LLM_loader,
     }
@@ -292,24 +291,6 @@ def llamacpp_HF_loader(model_name):
     return model
 
 
-def AutoAWQ_loader(model_name):
-    from awq import AutoAWQForCausalLM
-
-    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
-
-    model = AutoAWQForCausalLM.from_quantized(
-        quant_path=model_dir,
-        max_new_tokens=shared.args.max_seq_len,
-        trust_remote_code=shared.args.trust_remote_code,
-        fuse_layers=not shared.args.no_inject_fused_attention,
-        max_memory=get_max_memory_dict(),
-        batch_size=1,
-        safetensors=any(model_dir.glob('*.safetensors')),
-    )
-
-    return model
-
-
 def AutoGPTQ_loader(model_name):
     import modules.AutoGPTQ_loader
 

modules/models_settings.py  (-2)

@@ -180,8 +180,6 @@ def infer_loader(model_name, model_settings):
         loader = None
     elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
         loader = 'ExLlamav2_HF'
-    elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
-        loader = 'AutoAWQ'
     elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
         loader = 'llamacpp_HF'
     elif len(list(path_to_model.glob('*.gguf'))) > 0:

modules/shared.py  (+2, -7)

@@ -89,7 +89,7 @@
 
 # Model loader
 group = parser.add_argument_group('Model loader')
-group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ.')
+group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
@@ -160,10 +160,6 @@
 group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
 group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
 
-# AutoAWQ
-group = parser.add_argument_group('AutoAWQ')
-group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')
-
 # HQQ
 group = parser.add_argument_group('HQQ')
 group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
@@ -217,6 +213,7 @@
 group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED')
 group.add_argument('--checkpoint', type=str, help='DEPRECATED')
 group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED')
+group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED')
 
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
@@ -267,8 +264,6 @@ def fix_loader_name(name):
         return 'ExLlamav2'
     elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
         return 'ExLlamav2_HF'
-    elif name in ['autoawq', 'awq', 'auto-awq']:
-        return 'AutoAWQ'
     elif name in ['hqq']:
         return 'HQQ'
     elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:

modules/ui.py  (-1)

@@ -78,7 +78,6 @@ def list_model_elements():
         'groupsize',
         'triton',
         'desc_act',
-        'no_inject_fused_attention',
         'no_inject_fused_mlp',
         'no_use_cuda_fp16',
         'disable_exllama',

modules/ui_chat.py  (+2, -2)

@@ -84,13 +84,13 @@ def create_ui():
             shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
 
         with gr.Row():
-            shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
+            shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
 
         with gr.Row():
             shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
 
         with gr.Row():
-            shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar'])
+            shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])
 
 
 def create_chat_settings_ui():
