v7.4.0

BBC-Esq · Feb 16, 2025 · 10f4c1e · 10f4c1e
1 parent 9b266b5
commit 10f4c1e
Show file tree

Hide file tree

Showing 11 changed files with 518 additions and 47 deletions.
diff --git a/src/chart_models_vision.py b/src/chart_models_vision.py
@@ -28,6 +28,8 @@ def create_vision_models_comparison_plot():
         {"model": "Moondream2 - 2b", "cps": 344.97, "memory": 4461.80},
         {"model": "InternVL2.5 - 4b", "cps": 173.57, "memory": 3151.93},
         {"model": "InternVL2.5 - 1b", "cps": 291.18, "memory": 2385.93},
+        {"model": "Ovis2 - 1b", "cps": 286.39, "memory": 4071.93},
+        {"model": "Ovis2 - 2b", "cps": 312.08, "memory": 5846.49},
     ]
 
     df = pd.DataFrame(data)

diff --git a/src/choose_documents_and_vector_model.py b/src/choose_documents_and_vector_model.py
@@ -31,7 +31,6 @@ def choose_documents_directory():
     file_dialog = QFileDialog()
 
     if clicked_button == dir_button:
-        # Directory selection mode
         file_dialog.setFileMode(QFileDialog.Directory)
         file_dialog.setOption(QFileDialog.ShowDirsOnly, True)
         selected_dir = file_dialog.getExistingDirectory(None, "Choose Directory for Database", str(current_dir))
@@ -59,7 +58,6 @@ def choose_documents_directory():
                 except Exception as e:
                     print(f"Error creating symlinks: {e}")
     else:
-        # File selection mode
         file_dialog.setFileMode(QFileDialog.ExistingFiles)
         file_paths = file_dialog.getOpenFileNames(None, "Choose Documents and Images for Database", str(current_dir))[0]
         if file_paths:

diff --git a/src/constants.py b/src/constants.py
@@ -77,7 +77,7 @@
     "ctranslate2==4.5.0",
     "cycler==0.12.1",
     "dataclasses-json==0.6.7",
-    "datasets==3.2.0",
+    "datasets==3.3.0",
     "deepdiff==8.2.0", # required by unstructured
     "dill==0.3.8", # datasets 3.2.0 requires <0.3.9; multiprocess 0.70.16 requires >=0.3.8
     "distro==1.9.0",
@@ -144,7 +144,7 @@
     "numba==0.61.0", # only required by openai-whisper
     "numpy==1.26.4", # langchain libraries <2; numba <2.1; scipy <2.3; chattts <2.0.0
     "olefile==0.47",
-    "openai==1.62.0", # only required by chat_lm_studio.py script and whispers2t (if using openai vanilla backend)
+    "openai==1.63.0", # only required by chat_lm_studio.py script and whispers2t (if using openai vanilla backend)
     "openai-whisper==20240930", # only required by whisper_s2t (if using openai vanilla backend)
     "openpyxl==3.1.5",
     "optimum==1.24.0",
@@ -159,7 +159,7 @@
     "platformdirs==4.3.6",
     "propcache==0.2.1",
     "protobuf==5.29.3",
-    "psutil==6.1.1",
+    "psutil==7.0.0",
     "pyarrow==19.0.0",
     "pybase16384==0.3.8", # only required by chattts
     "pycparser==2.22",
@@ -204,8 +204,11 @@
     "tblib==1.7.0", # tiledb-cloud requires >= 1.7.0 but < 1.8.0
     "tenacity==9.0.0",
     "termcolor==2.5.0",
+    "https://github.com/simonflueckiger/tesserocr-windows_build/releases/download/tesserocr-v2.8.0-tesseract-5.5.0/tesserocr-2.8.0-cp311-cp311-win_amd64.whl",
+    "tessdata==1.0.0",
+    "tessdata.eng==1.0.0",
     "threadpoolctl==3.5.0",
-    "tiktoken==0.8.0",
+    "tiktoken==0.9.0",
     "tiledb==0.33.3",
     "tiledb-cloud==0.13.0",
     "tiledb-vector-search==0.11.0",
@@ -215,7 +218,7 @@
     "transformers==4.48.3",
     "typing-inspect==0.9.0",
     "typing_extensions==4.12.2",
-    "unstructured-client==0.29.0",
+    "unstructured-client==0.30.0",
     "tzdata==2025.1",
     "urllib3==2.3.0", # requests 2.32.3 requires <3
     "vector-quantize-pytorch==1.21.8",
@@ -852,6 +855,16 @@
     }
 }
 
+# OCR model entries, keyed by model name.  Each entry records the model's
+# precision, a size label, its repo_id, the cache_dir name, and a flag saying
+# whether CUDA is required.
+OCR_MODELS = {
+    'GOT-OCR2': {
+        'precision': 'bfloat16',
+        'size': '716m',
+        'repo_id': 'ctranslate2-4you/GOT-OCR2_0-Customized',  # presumably a Hugging Face repo id - TODO confirm
+        'cache_dir': 'ctranslate2-4you--GOT-OCR2_0-Customized',  # repo_id with '/' replaced by '--'
+        'requires_cuda': True,
+    },
+}
+
 TTS_MODELS = {
     "Kokoro": {
         "model": "Kokoro",
@@ -2747,7 +2760,12 @@ def _generate_button_style(cls, color_values):
     "What is the manage databases Tab?",
     "What is the Settings Tab?",
     "What is the Models Tab?",
-    "What does precision mean?"
+    "What does precision mean?",
+    "What is OCR or Optical Character Recognition?",
+    "What OCR backends are available in this program?",
+    "What is Tesseract?",
+    "What is GOT OCR?",
+    "How can I use optical character recognition in this program?"
 ]
 
 jeeves_system_message = "You are a helpful British butler who clearly and directly answers questions in a succinct fashion based on contexts provided to you. If you cannot find the answer within the contexts simply tell me that the contexts do not provide an answer. However, if the contexts partially address a question you answer based on what the contexts say and then briefly summarize the parts of the question that the contexts didn't provide an answer to.  Also, you should be very respectful to the person asking the question and frequently offer traditional butler services like various fancy drinks, snacks, various butler services like shining of shoes, pressing of suites, and stuff like that. Also, if you can't answer the question at all based on the provided contexts, you should apologize profusely and beg to keep your job.  Lastly, it is essential that if there are no contexts actually provided it means that a user's question wasn't relevant and you should state that you can't answer based off of the contexts because there are none.  And it goes without saying you should refuse to answer any questions that are not directly answerable by the provided contexts.  Moreover, some of the contexts might not have relevant information and you should simply ignore them and focus on only answering a user's question.  I cannot emphasize enough that you must gear your answer towards using this program and based your response off of the contexts you receive.  Lastly, in addition to offering to perform stereotypical butler services in the midst of your response, you must always always always end your response with some kind of offering of butler services even they don't want it."

diff --git a/src/download_model.py b/src/download_model.py
@@ -16,7 +16,8 @@ class ModelDownloadedSignal(QObject):
     "vector": "vector",
     "chat": "chat", 
     "tts": "tts",
-    "jeeves": "jeeves"
+    "jeeves": "jeeves",
+    "ocr": "ocr"
 }
 
 class ModelDownloader(QObject):

diff --git a/src/gui.py b/src/gui.py
@@ -11,6 +11,8 @@
 from ctypes import windll, byref, sizeof, c_void_p, c_int
 from ctypes.wintypes import BOOL, HWND, DWORD
 
+from PySide6.QtCore import QTimer
+
 from PySide6.QtWidgets import (
     QApplication, QWidget, QVBoxLayout, QTabWidget,
     QMenuBar, QHBoxLayout, QMessageBox, QInputDialog
@@ -104,6 +106,9 @@ def init_menu(self):
         self.jeeves_action.triggered.connect(self.open_chat_window)
 
     def open_chat_window(self):
+        self.jeeves_action.setEnabled(False)
+        QTimer.singleShot(5000, lambda: self.jeeves_action.setEnabled(True))
+
         required_folder = script_dir / 'Models' / 'vector' / 'BAAI--bge-small-en-v1.5'
         if not required_folder.exists() or not required_folder.is_dir():
             QMessageBox.warning(
@@ -175,7 +180,6 @@ def main():
 
     app = QApplication(sys.argv)
 
-    # Optionally, set the application font size based on DPI (recommended)
     # font = app.font()
     # font.setPointSize(10)  # Adjust as necessary
     # app.setFont(font)

diff --git a/src/gui_tabs_tools.py b/src/gui_tabs_tools.py
@@ -3,12 +3,10 @@
 from gui_tabs_tools_transcribe import TranscriberToolSettingsTab
 from gui_tabs_tools_vision import VisionToolSettingsTab
 from gui_tabs_tools_scrape import ScrapeDocumentationTab
+from gui_tabs_tools_ocr import OCRToolSettingsTab
 from gui_tabs_tools_misc import MiscTab
 from initialize import restore_vector_db_backup
 from utilities import backup_database
-# from gui_tabs_tools_ocr import OcrToolSettingsTab
-# from gui_tabs_tools_keybert import KeywordExtractorTab
-
 
 class RestoreBackupThread(QThread):
     finished = Signal(bool)
@@ -36,12 +34,11 @@ def __init__(self):
         self.layout = QVBoxLayout(self)
         self.groups = {}
         classes = {
-            "TRANSCRIBE FILE": (TranscriberToolSettingsTab, 1.5),
-            "SCRAPE DOCUMENTATION": (ScrapeDocumentationTab, 1.5),
-            "TEST VISION MODEL": (VisionToolSettingsTab, 6),
-            # "PERFORM OCR": (OcrToolSettingsTab, 2),
-            # "KEYWORD EXTRACTOR": (KeywordExtractorTab, 3),
-            "MISC": (MiscTab, 1),
+            "TRANSCRIBE FILE": (TranscriberToolSettingsTab, 3),
+            "SCRAPE DOCUMENTATION": (ScrapeDocumentationTab, 3),
+            "TEST VISION MODELS": (VisionToolSettingsTab, 2),
+            "OPTICAL CHARACTER RECOGNITION": (OCRToolSettingsTab, 3),
+            "MISC": (MiscTab, 2),
         }
         for title, (TabClass, stretch) in classes.items():
             settings = TabClass()

diff --git a/src/gui_tabs_tools_ocr.py b/src/gui_tabs_tools_ocr.py
@@ -0,0 +1,181 @@
+import multiprocessing
+from pathlib import Path
+import fitz
+from PySide6.QtWidgets import (
+    QWidget, QHBoxLayout, QVBoxLayout, QPushButton, QLabel, 
+    QComboBox, QFileDialog, QMessageBox
+)
+from PySide6.QtCore import QThread, Signal
+from module_ocr import process_documents
+
+def check_cuda_availability():
+    try:
+        import torch
+        return torch.cuda.is_available()
+    except ImportError:
+        return False
+
+def get_pdf_page_count(pdf_path):
+    try:
+        with fitz.open(pdf_path) as doc:
+            return doc.page_count
+    except Exception as e:
+        print(f"Error reading PDF: {e}")
+        return 0
+
+def run_ocr_process(pdf_path, backend, model_path):
+    try:
+        if backend == "tesseract":
+            model_path = None
+        process_documents(
+            pdf_paths=Path(pdf_path),
+            backend=backend,
+            model_path=model_path,
+        )
+        return True, None
+    except Exception as e:
+        return False, str(e)
+
+class OcrWorkerThread(QThread):
+    finished_signal = Signal(bool, str)
+
+    def __init__(self, pdf_path, backend, model_path, parent=None):
+        super().__init__(parent)
+        self.pdf_path = pdf_path
+        self.backend = backend
+        self.model_path = "ctranslate2-4you/GOT-OCR2_0-Customized"
+
+    def run(self):
+        result = run_ocr_process(self.pdf_path, self.backend, self.model_path)
+        self.finished_signal.emit(*result)
+
+class OCRToolSettingsTab(QWidget):
+    ENGINE_MAPPING = {
+        "Tesseract": "tesseract",
+        "GOT_OCR": "got"
+    }
+
+    def __init__(self):
+        super().__init__()
+        self.selected_pdf_file = None
+        self.model_path = None
+        self.create_layout()
+        self.setButtons(True)
+        self.worker_thread = None
+
+    def create_layout(self):
+        main_layout = QVBoxLayout()
+
+        engine_selection_hbox = QHBoxLayout()
+
+        engine_label = QLabel("OCR Engine")
+        engine_selection_hbox.addWidget(engine_label)
+
+        self.engine_combo = QComboBox()
+        self.engine_combo.addItems(["Tesseract", "GOT_OCR"])
+        self.engine_combo.setCurrentText("Tesseract")
+        engine_selection_hbox.addWidget(self.engine_combo)
+
+        self.select_pdf_button = QPushButton("Choose PDF")
+        self.select_pdf_button.clicked.connect(self.select_pdf_file)
+        engine_selection_hbox.addWidget(self.select_pdf_button)
+
+        self.process_button = QPushButton("Process")
+        self.process_button.clicked.connect(self.start_ocr_process)
+        engine_selection_hbox.addWidget(self.process_button)
+
+        engine_selection_hbox.setStretchFactor(engine_label, 1)
+        engine_selection_hbox.setStretchFactor(self.engine_combo, 2)
+        engine_selection_hbox.setStretchFactor(self.select_pdf_button, 1)
+        engine_selection_hbox.setStretchFactor(self.process_button, 1)
+
+        main_layout.addLayout(engine_selection_hbox)
+
+        self.file_path_label = QLabel("No PDF file selected")
+        main_layout.addWidget(self.file_path_label)
+
+        self.status_label = QLabel("")
+        self.status_label.setStyleSheet("color: gray;")
+        main_layout.addWidget(self.status_label)
+
+        self.setLayout(main_layout)
+
+    def setButtons(self, enabled):
+        self.select_pdf_button.setEnabled(enabled)
+        self.process_button.setEnabled(enabled)
+        self.engine_combo.setEnabled(enabled)
+        if enabled:
+            self.status_label.setText("")
+
+    def select_pdf_file(self):
+        current_dir = Path.cwd()
+        file_name, _ = QFileDialog.getOpenFileName(
+            self, 
+            "Select PDF File", 
+            str(current_dir),
+            "PDF Files (*.pdf)"
+        )
+        if file_name:
+            file_path = Path(file_name)
+            short_path = f"...{file_path.parent.name}/{file_path.name}"
+            self.file_path_label.setText(short_path)
+            self.file_path_label.setToolTip(str(file_path.absolute()))
+            self.selected_pdf_file = file_name
+            self.status_label.setText("")
+
+    def show_error_message(self, message):
+        self.status_label.setStyleSheet("color: red;")
+        self.status_label.setText("Error: OCR process failed")
+        QMessageBox.critical(self, "Error", f"OCR process failed:\n{message}")
+
+    def show_success_message(self):
+        self.status_label.setStyleSheet("color: #4CAF50;")
+        self.status_label.setText("Success! A .txt file has been saved in the same directory as the original .pdf.")
+        QMessageBox.information(self, "Success!", "A .txt file has been saved in the same directory as the original .pdf.")
+
+    def start_ocr_process(self):
+        if not self.selected_pdf_file:
+            QMessageBox.warning(self, "Warning", "Please select a PDF file first.")
+            return
+
+        selected_engine = self.engine_combo.currentText()
+        backend = self.ENGINE_MAPPING[selected_engine]
+
+        if backend == "got":
+            if not check_cuda_availability():
+                QMessageBox.warning(
+                    self,
+                    "CUDA Not Available",
+                    "GOT_OCR requires PyTorch with CUDA support. Please use Tesseract instead."
+                )
+                return
+
+            page_count = get_pdf_page_count(self.selected_pdf_file)
+            if page_count > 100:
+                reply = QMessageBox.question(
+                    self,
+                    "Large PDF Warning",
+                    f"The selected PDF has {page_count} pages. "
+                    "Are you sure that you want to proceed with the GOT_OCR backend? "
+                    "This backend processes a single page in approximately 10-15 seconds.",
+                    QMessageBox.StandardButton.Ok | QMessageBox.StandardButton.Cancel
+                )
+                if reply == QMessageBox.StandardButton.Cancel:
+                    return
+
+        self.status_label.setStyleSheet("color: #0074D9;")
+        self.status_label.setText(f"Processing with {selected_engine}...")
+        print(f"Starting OCR process for {self.selected_pdf_file}")
+
+        self.setButtons(False)
+
+        self.worker_thread = OcrWorkerThread(self.selected_pdf_file, backend, self.model_path)
+        self.worker_thread.finished_signal.connect(self.ocr_finished)
+        self.worker_thread.start()
+
+    def ocr_finished(self, success, message):
+        self.setButtons(True)
+        if success:
+            self.show_success_message()
+        else:
+            self.show_error_message(message)