
Commit dd97a83

Merge pull request oobabooga#6271 from oobabooga/dev
Merge dev branch

2 parents: af839d2 + e4624fb
28 files changed: +135 -123 lines

Colab-TextGen-GPU.ipynb  (+16, -33)

@@ -22,7 +22,7 @@
 "source": [
 "# oobabooga/text-generation-webui\n",
 "\n",
-"After running both cells, a public gradio URL will appear at the bottom in a few minutes. You can optionally generate an API link.\n",
+"After running both cells, a public gradio URL will appear at the bottom in around 10 minutes. You can optionally generate an API link.\n",
 "\n",
 "* Project page: https://github.com/oobabooga/text-generation-webui\n",
 "* Gradio server status: https://status.gradio.app/"
@@ -53,44 +53,28 @@
 "\n",
 "#@markdown If unsure about the branch, write \"main\" or leave it blank.\n",
 "\n",
-"import torch\n",
+"import os\n",
 "from pathlib import Path\n",
 "\n",
+"os.environ.pop('PYTHONPATH', None)\n",
+"\n",
 "if Path.cwd().name != 'text-generation-webui':\n",
-"  print(\"Installing the webui...\")\n",
+"  print(\"\\033[1;32;1m\\n --> Installing the web UI. This will take a while, but after the initial setup, you can download and test as many models as you like.\\033[0;37;0m\\n\")\n",
 "\n",
 "  !git clone https://github.com/oobabooga/text-generation-webui\n",
 "  %cd text-generation-webui\n",
 "\n",
-"  torver = torch.__version__\n",
-"  print(f\"TORCH: {torver}\")\n",
-"  is_cuda118 = '+cu118' in torver  # 2.1.0+cu118\n",
-"\n",
-"  if is_cuda118:\n",
-"    !python -m pip install --upgrade torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118\n",
-"  else:\n",
-"    !python -m pip install --upgrade torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121\n",
-"\n",
-"  textgen_requirements = open('requirements.txt').read().splitlines()\n",
-"  if is_cuda118:\n",
-"    textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements]\n",
-"  with open('temp_requirements.txt', 'w') as file:\n",
-"    file.write('\\n'.join(textgen_requirements))\n",
-"\n",
-"  !pip install -r temp_requirements.txt --upgrade\n",
-"\n",
-"  print(\"\\033[1;32;1m\\n --> If you see a warning about \\\"previously imported packages\\\", just ignore it.\\033[0;37;0m\")\n",
-"  print(\"\\033[1;32;1m\\n --> There is no need to restart the runtime.\\n\\033[0;37;0m\")\n",
-"\n",
-"  try:\n",
-"    import flash_attn\n",
-"  except:\n",
-"    !pip uninstall -y flash_attn\n",
+"  # Install the project in an isolated environment\n",
+"  !GPU_CHOICE=A \\\n",
+"  USE_CUDA118=FALSE \\\n",
+"  LAUNCH_AFTER_INSTALL=FALSE \\\n",
+"  INSTALL_EXTENSIONS=FALSE \\\n",
+"  ./start_linux.sh\n",
 "\n",
 "# Parameters\n",
-"model_url = \"https://huggingface.co/TheBloke/MythoMax-L2-13B-GPTQ\" #@param {type:\"string\"}\n",
-"branch = \"gptq-4bit-32g-actorder_True\" #@param {type:\"string\"}\n",
-"command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant\" #@param {type:\"string\"}\n",
+"model_url = \"https://huggingface.co/turboderp/gemma-2-9b-it-exl2\" #@param {type:\"string\"}\n",
+"branch = \"8.0bpw\" #@param {type:\"string\"}\n",
+"command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant --no_flash_attn\" #@param {type:\"string\"}\n",
 "api = False #@param {type:\"boolean\"}\n",
 "\n",
 "if api:\n",
@@ -116,11 +100,10 @@
 "  output_folder = \"\"\n",
 "\n",
 "# Start the web UI\n",
-"cmd = f\"python server.py --share\"\n",
+"cmd = f\"./start_linux.sh {command_line_flags} --share\"\n",
 "if output_folder != \"\":\n",
 "  cmd += f\" --model {output_folder}\"\n",
-"cmd += f\" {command_line_flags}\"\n",
-"print(cmd)\n",
+"\n",
 "!$cmd"
 ],
 "metadata": {

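Note: the notebook now defers environment setup to the project's own installer instead of pinning torch and requirements itself. As a rough illustration, the same non-interactive install could be driven from a plain Python script. The variable names below mirror the notebook cell above; treating them as a stable interface of start_linux.sh is an assumption taken from that cell, not from documented behavior.

import os
import subprocess

# Sketch: reproduce the notebook's non-interactive install outside Colab.
# GPU_CHOICE / USE_CUDA118 / LAUNCH_AFTER_INSTALL / INSTALL_EXTENSIONS are the
# variables the notebook sets before calling the installer.
env = os.environ.copy()
env.update({
    "GPU_CHOICE": "A",               # first GPU option, no interactive prompt
    "USE_CUDA118": "FALSE",
    "LAUNCH_AFTER_INSTALL": "FALSE",
    "INSTALL_EXTENSIONS": "FALSE",
})

# Assumes the repository has already been cloned into ./text-generation-webui
subprocess.run(["./start_linux.sh"], cwd="text-generation-webui", env=env, check=True)
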
cmd_linux.sh  (+1, -1)

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 cd "$(dirname "${BASH_SOURCE[0]}")"
 

css/html_instruct_style.css  (-8)

@@ -39,14 +39,6 @@
   margin-bottom: 0 !important;
 }
 
-.dark .message-body p em {
-  color: rgb(198 202 214) !important;
-}
-
-.message-body p em {
-  color: rgb(110 110 110) !important;
-}
-
 .gradio-container .chat .assistant-message {
   padding: 20px;
   background: #f4f4f4;

css/main.css  (+8)

@@ -406,6 +406,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
   color: var(--body-text-color);
 }
 
+.dark .message q {
+  color: #f5b031;
+}
+
+.message q::before, .message q::after {
+  content: "";
+}
+
 .message-body li {
   list-style-position: outside;
 }

js/main.js  (-2)

@@ -213,12 +213,10 @@ function doSyntaxHighlighting() {
       renderMathInElement(element, {
         delimiters: [
           { left: "$$", right: "$$", display: true },
-          { left: "$", right: "$", display: false },
           { left: "\\(", right: "\\)", display: false },
           { left: "\\[", right: "\\]", display: true },
         ],
       });
-
     });
 
   observer.observe(targetElement, config);

modules/LoRA.py  (-2)

@@ -72,8 +72,6 @@ def add_lora_autogptq(lora_names):
     else:
         if len(lora_names) > 1:
             logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
-        if not shared.args.no_inject_fused_attention:
-            logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.')
 
         peft_config = GPTQLoraConfig(
             inference_mode=True,

modules/chat.py  (+18, -3)

@@ -17,7 +17,11 @@
 import modules.shared as shared
 from modules import utils
 from modules.extensions import apply_extensions
-from modules.html_generator import chat_html_wrapper, make_thumbnail
+from modules.html_generator import (
+    chat_html_wrapper,
+    convert_to_markdown,
+    make_thumbnail
+)
 from modules.logging_colors import logger
 from modules.text_generation import (
     generate_reply,
@@ -368,7 +372,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
 
 
 def impersonate_wrapper(text, state):
-
     static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
     prompt = generate_chat_prompt('', state, impersonate=True)
@@ -488,7 +491,7 @@ def start_new_chat(state):
     greeting = replace_character_names(state['greeting'], state['name1'], state['name2'])
     if greeting != '':
         history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]]
-        history['visible'] += [['', apply_extensions('output', greeting, state, is_chat=True)]]
+        history['visible'] += [['', apply_extensions('output', html.escape(greeting), state, is_chat=True)]]
 
     unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
     save_history(history, unique_id, state['character_menu'], state['mode'])
@@ -1044,6 +1047,8 @@ def handle_unique_id_select(state):
     history = load_history(state['unique_id'], state['character_menu'], state['mode'])
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [history, html]
 
 
@@ -1052,6 +1057,8 @@ def handle_start_new_chat_click(state):
     histories = find_all_histories_with_first_prompts(state)
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [history, html, gr.update(choices=histories, value=histories[0][1])]
 
 
@@ -1061,6 +1068,8 @@ def handle_delete_chat_confirm_click(state):
     history, unique_id = load_history_after_deletion(state, index)
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [
         history,
         html,
@@ -1099,6 +1108,8 @@ def handle_upload_chat_history(load_chat_history, state):
 
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [
         history,
         html,
@@ -1119,6 +1130,8 @@ def handle_character_menu_change(state):
     histories = find_all_histories_with_first_prompts(state)
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [
         history,
         html,
@@ -1136,6 +1149,8 @@ def handle_mode_change(state):
     histories = find_all_histories_with_first_prompts(state)
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
+    convert_to_markdown.cache_clear()
+
     return [
         history,
         html,

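Note: the chat handlers above now call convert_to_markdown.cache_clear() whenever the displayed history changes, which presumably keeps the now-unbounded cache from carrying entries across chats. This is the standard functools.lru_cache pattern; a minimal illustration follows, where convert is only a stand-in for the real convert_to_markdown.

import functools

@functools.lru_cache(maxsize=None)       # unbounded cache, as in the new decorator
def convert(text: str) -> str:
    print(f"converting {text!r}")         # visible side effect to expose cache misses
    return f"<p>{text}</p>"               # stand-in for the real markdown conversion

convert("hello")        # computed
convert("hello")        # served from the cache, no print
convert.cache_clear()   # what the handlers call when a different history is loaded
convert("hello")        # computed again
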
modules/html_generator.py  (+28, -1)

@@ -42,13 +42,39 @@ def fix_newlines(string):
     return string
 
 
+def replace_quotes(text):
+
+    # Define a list of quote pairs (opening and closing), using HTML entities
+    quote_pairs = [
+        ('&quot;', '&quot;'),  # Double quotes
+        ('&ldquo;', '&rdquo;'),  # Unicode left and right double quotation marks
+        ('&lsquo;', '&rsquo;'),  # Unicode left and right single quotation marks
+        ('&laquo;', '&raquo;'),  # French quotes
+        ('&bdquo;', '&ldquo;'),  # German quotes
+        ('&lsquo;', '&rsquo;'),  # Alternative single quotes
+        ('&#8220;', '&#8221;'),  # Unicode quotes (numeric entities)
+        ('&#x201C;', '&#x201D;'),  # Unicode quotes (hex entities)
+    ]
+
+    # Create a regex pattern that matches any of the quote pairs, including newlines
+    pattern = '|'.join(f'({re.escape(open_q)})(.*?)({re.escape(close_q)})' for open_q, close_q in quote_pairs)
+
+    # Replace matched patterns with <q> tags, keeping original quotes
+    replaced_text = re.sub(pattern, lambda m: f'<q>{m.group(1)}{m.group(2)}{m.group(3)}</q>', text, flags=re.DOTALL)
+
+    return replaced_text
+
+
 def replace_blockquote(m):
     return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
 
 
-@functools.lru_cache(maxsize=4096)
+@functools.lru_cache(maxsize=None)
 def convert_to_markdown(string):
 
+    # Quote to <q></q>
+    string = replace_quotes(string)
+
     # Blockquote
     string = re.sub(r'(^|[\n])&gt;', r'\1>', string)
     pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
@@ -124,6 +150,7 @@ def convert_to_markdown_wrapped(string, use_cache=True):
 
 
 def generate_basic_html(string):
+    convert_to_markdown.cache_clear()
     string = convert_to_markdown(string)
     string = f'<style>{readable_css}</style><div class="readable-container">{string}</div>'
     return string

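Note: the new replace_quotes pass runs on already-HTML-escaped text and wraps each quoted span in <q> tags, which the css/main.css change above then colors in dark mode. A small standalone sketch of the same substitution, reduced to the plain &quot; pair for brevity:

import re

quote_pairs = [('&quot;', '&quot;')]  # subset of the pairs defined in the diff
pattern = '|'.join(f'({re.escape(o)})(.*?)({re.escape(c)})' for o, c in quote_pairs)

text = 'She said &quot;follow me&quot; and turned around.'
print(re.sub(pattern, lambda m: f'<q>{m.group(1)}{m.group(2)}{m.group(3)}</q>', text, flags=re.DOTALL))
# -> She said <q>&quot;follow me&quot;</q> and turned around.
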
modules/loaders.py  (-10)

@@ -127,15 +127,6 @@
         'no_use_fast',
         'autogptq_info',
     ],
-    'AutoAWQ': [
-        'cpu_memory',
-        'gpu_memory',
-        'auto_devices',
-        'max_seq_len',
-        'no_inject_fused_attention',
-        'trust_remote_code',
-        'no_use_fast',
-    ],
     'HQQ': [
         'hqq_backend',
         'trust_remote_code',
@@ -200,7 +191,6 @@ def transformers_samplers():
 loaders_samplers = {
     'Transformers': transformers_samplers(),
     'AutoGPTQ': transformers_samplers(),
-    'AutoAWQ': transformers_samplers(),
     'HQQ': transformers_samplers(),
     'ExLlamav2': {
         'temperature',

modules/models.py  (-19)

@@ -75,7 +75,6 @@ def load_model(model_name, loader=None):
         'llamacpp_HF': llamacpp_HF_loader,
         'ExLlamav2': ExLlamav2_loader,
         'ExLlamav2_HF': ExLlamav2_HF_loader,
-        'AutoAWQ': AutoAWQ_loader,
         'HQQ': HQQ_loader,
         'TensorRT-LLM': TensorRT_LLM_loader,
     }
@@ -292,24 +291,6 @@ def llamacpp_HF_loader(model_name):
     return model
 
 
-def AutoAWQ_loader(model_name):
-    from awq import AutoAWQForCausalLM
-
-    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
-
-    model = AutoAWQForCausalLM.from_quantized(
-        quant_path=model_dir,
-        max_new_tokens=shared.args.max_seq_len,
-        trust_remote_code=shared.args.trust_remote_code,
-        fuse_layers=not shared.args.no_inject_fused_attention,
-        max_memory=get_max_memory_dict(),
-        batch_size=1,
-        safetensors=any(model_dir.glob('*.safetensors')),
-    )
-
-    return model
-
-
 def AutoGPTQ_loader(model_name):
     import modules.AutoGPTQ_loader
 

modules/models_settings.py  (-2)

@@ -180,8 +180,6 @@ def infer_loader(model_name, model_settings):
         loader = None
     elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
         loader = 'ExLlamav2_HF'
-    elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
-        loader = 'AutoAWQ'
     elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
         loader = 'llamacpp_HF'
     elif len(list(path_to_model.glob('*.gguf'))) > 0:

modules/shared.py  (+2, -7)

@@ -89,7 +89,7 @@
 
 # Model loader
 group = parser.add_argument_group('Model loader')
-group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ.')
+group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
@@ -160,10 +160,6 @@
 group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
 group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
 
-# AutoAWQ
-group = parser.add_argument_group('AutoAWQ')
-group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')
-
 # HQQ
 group = parser.add_argument_group('HQQ')
 group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
@@ -217,6 +213,7 @@
 group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED')
 group.add_argument('--checkpoint', type=str, help='DEPRECATED')
 group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED')
+group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED')
 
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
@@ -267,8 +264,6 @@ def fix_loader_name(name):
         return 'ExLlamav2'
     elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
         return 'ExLlamav2_HF'
-    elif name in ['autoawq', 'awq', 'auto-awq']:
-        return 'AutoAWQ'
     elif name in ['hqq']:
         return 'HQQ'
     elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:

modules/ui.py  (-1)

@@ -78,7 +78,6 @@ def list_model_elements():
         'groupsize',
         'triton',
         'desc_act',
-        'no_inject_fused_attention',
         'no_inject_fused_mlp',
         'no_use_cuda_fp16',
         'disable_exllama',

modules/ui_chat.py  (+2, -2)

@@ -84,13 +84,13 @@ def create_ui():
             shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
 
         with gr.Row():
-            shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
+            shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
 
         with gr.Row():
             shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
 
         with gr.Row():
-            shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar'])
+            shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])
 
 
 def create_chat_settings_ui():
