Skip to content

Commit

Permalink
v7.4.0
Browse files Browse the repository at this point in the history
  • Loading branch information
BBC-Esq authored Feb 16, 2025
1 parent 9b266b5 commit 10f4c1e
Show file tree
Hide file tree
Showing 11 changed files with 518 additions and 47 deletions.
2 changes: 2 additions & 0 deletions src/chart_models_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ def create_vision_models_comparison_plot():
{"model": "Moondream2 - 2b", "cps": 344.97, "memory": 4461.80},
{"model": "InternVL2.5 - 4b", "cps": 173.57, "memory": 3151.93},
{"model": "InternVL2.5 - 1b", "cps": 291.18, "memory": 2385.93},
{"model": "Ovis2 - 1b", "cps": 286.39, "memory": 4071.93},
{"model": "Ovis2 - 2b", "cps": 312.08, "memory": 5846.49},
]

df = pd.DataFrame(data)
Expand Down
2 changes: 0 additions & 2 deletions src/choose_documents_and_vector_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def choose_documents_directory():
file_dialog = QFileDialog()

if clicked_button == dir_button:
# Directory selection mode
file_dialog.setFileMode(QFileDialog.Directory)
file_dialog.setOption(QFileDialog.ShowDirsOnly, True)
selected_dir = file_dialog.getExistingDirectory(None, "Choose Directory for Database", str(current_dir))
Expand Down Expand Up @@ -59,7 +58,6 @@ def choose_documents_directory():
except Exception as e:
print(f"Error creating symlinks: {e}")
else:
# File selection mode
file_dialog.setFileMode(QFileDialog.ExistingFiles)
file_paths = file_dialog.getOpenFileNames(None, "Choose Documents and Images for Database", str(current_dir))[0]
if file_paths:
Expand Down
30 changes: 24 additions & 6 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
"ctranslate2==4.5.0",
"cycler==0.12.1",
"dataclasses-json==0.6.7",
"datasets==3.2.0",
"datasets==3.3.0",
"deepdiff==8.2.0", # required by unstructured
"dill==0.3.8", # datasets 3.2.0 requires <0.3.9; multiprocess 0.70.16 requires >=0.3.8
"distro==1.9.0",
Expand Down Expand Up @@ -144,7 +144,7 @@
"numba==0.61.0", # only required by openai-whisper
"numpy==1.26.4", # langchain libraries <2; numba <2.1; scipy <2.3; chattts <2.0.0
"olefile==0.47",
"openai==1.62.0", # only required by chat_lm_studio.py script and whispers2t (if using openai vanilla backend)
"openai==1.63.0", # only required by chat_lm_studio.py script and whispers2t (if using openai vanilla backend)
"openai-whisper==20240930", # only required by whisper_s2t (if using openai vanilla backend)
"openpyxl==3.1.5",
"optimum==1.24.0",
Expand All @@ -159,7 +159,7 @@
"platformdirs==4.3.6",
"propcache==0.2.1",
"protobuf==5.29.3",
"psutil==6.1.1",
"psutil==7.0.0",
"pyarrow==19.0.0",
"pybase16384==0.3.8", # only required by chattts
"pycparser==2.22",
Expand Down Expand Up @@ -204,8 +204,11 @@
"tblib==1.7.0", # tiledb-cloud requires >= 1.7.0 but < 1.8.0
"tenacity==9.0.0",
"termcolor==2.5.0",
"https://github.com/simonflueckiger/tesserocr-windows_build/releases/download/tesserocr-v2.8.0-tesseract-5.5.0/tesserocr-2.8.0-cp311-cp311-win_amd64.whl",
"tessdata==1.0.0",
"tessdata.eng==1.0.0",
"threadpoolctl==3.5.0",
"tiktoken==0.8.0",
"tiktoken==0.9.0",
"tiledb==0.33.3",
"tiledb-cloud==0.13.0",
"tiledb-vector-search==0.11.0",
Expand All @@ -215,7 +218,7 @@
"transformers==4.48.3",
"typing-inspect==0.9.0",
"typing_extensions==4.12.2",
"unstructured-client==0.29.0",
"unstructured-client==0.30.0",
"tzdata==2025.1",
"urllib3==2.3.0", # requests 2.32.3 requires <3
"vector-quantize-pytorch==1.21.8",
Expand Down Expand Up @@ -852,6 +855,16 @@
}
}

# Metadata for the OCR models known to the program, keyed by model name.
OCR_MODELS = {
    "GOT-OCR2": {
        "precision": "bfloat16",
        "size": "716m",
        # Repository id and the matching cache folder name (the "/" in the
        # repository id appears as "--" in the folder name).
        "repo_id": "ctranslate2-4you/GOT-OCR2_0-Customized",
        "cache_dir": "ctranslate2-4you--GOT-OCR2_0-Customized",
        "requires_cuda": True,
    },
}

TTS_MODELS = {
"Kokoro": {
"model": "Kokoro",
Expand Down Expand Up @@ -2747,7 +2760,12 @@ def _generate_button_style(cls, color_values):
"What is the manage databases Tab?",
"What is the Settings Tab?",
"What is the Models Tab?",
"What does precision mean?"
"What does precision mean?",
"What is OCR or Optical Character Recognition?",
"What OCR backends are available in this program?",
"What is Tesseract?",
"What is GOT OCR?",
"How can I use optical character recognition in this program?"
]

# System prompt for the "Jeeves" persona: a British butler that answers only
# from the contexts supplied to it and always closes by offering butler services.
# NOTE(review): presumably passed verbatim to the chat model as its system
# message - confirm against the caller before editing the wording.
jeeves_system_message = "You are a helpful British butler who clearly and directly answers questions in a succinct fashion based on contexts provided to you. If you cannot find the answer within the contexts simply tell me that the contexts do not provide an answer. However, if the contexts partially address a question you answer based on what the contexts say and then briefly summarize the parts of the question that the contexts didn't provide an answer to. Also, you should be very respectful to the person asking the question and frequently offer traditional butler services like various fancy drinks, snacks, various butler services like shining of shoes, pressing of suites, and stuff like that. Also, if you can't answer the question at all based on the provided contexts, you should apologize profusely and beg to keep your job. Lastly, it is essential that if there are no contexts actually provided it means that a user's question wasn't relevant and you should state that you can't answer based off of the contexts because there are none. And it goes without saying you should refuse to answer any questions that are not directly answerable by the provided contexts. Moreover, some of the contexts might not have relevant information and you should simply ignore them and focus on only answering a user's question. I cannot emphasize enough that you must gear your answer towards using this program and based your response off of the contexts you receive. Lastly, in addition to offering to perform stereotypical butler services in the midst of your response, you must always always always end your response with some kind of offering of butler services even they don't want it."
Expand Down
3 changes: 2 additions & 1 deletion src/download_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ class ModelDownloadedSignal(QObject):
"vector": "vector",
"chat": "chat",
"tts": "tts",
"jeeves": "jeeves"
"jeeves": "jeeves",
"ocr": "ocr"
}

class ModelDownloader(QObject):
Expand Down
6 changes: 5 additions & 1 deletion src/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from ctypes import windll, byref, sizeof, c_void_p, c_int
from ctypes.wintypes import BOOL, HWND, DWORD

from PySide6.QtCore import QTimer

from PySide6.QtWidgets import (
QApplication, QWidget, QVBoxLayout, QTabWidget,
QMenuBar, QHBoxLayout, QMessageBox, QInputDialog
Expand Down Expand Up @@ -104,6 +106,9 @@ def init_menu(self):
self.jeeves_action.triggered.connect(self.open_chat_window)

def open_chat_window(self):
self.jeeves_action.setEnabled(False)
QTimer.singleShot(5000, lambda: self.jeeves_action.setEnabled(True))

required_folder = script_dir / 'Models' / 'vector' / 'BAAI--bge-small-en-v1.5'
if not required_folder.exists() or not required_folder.is_dir():
QMessageBox.warning(
Expand Down Expand Up @@ -175,7 +180,6 @@ def main():

app = QApplication(sys.argv)

# Optionally, set the application font size based on DPI (recommended)
# font = app.font()
# font.setPointSize(10) # Adjust as necessary
# app.setFont(font)
Expand Down
15 changes: 6 additions & 9 deletions src/gui_tabs_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
from gui_tabs_tools_transcribe import TranscriberToolSettingsTab
from gui_tabs_tools_vision import VisionToolSettingsTab
from gui_tabs_tools_scrape import ScrapeDocumentationTab
from gui_tabs_tools_ocr import OCRToolSettingsTab
from gui_tabs_tools_misc import MiscTab
from initialize import restore_vector_db_backup
from utilities import backup_database
# from gui_tabs_tools_ocr import OcrToolSettingsTab
# from gui_tabs_tools_keybert import KeywordExtractorTab


class RestoreBackupThread(QThread):
finished = Signal(bool)
Expand Down Expand Up @@ -36,12 +34,11 @@ def __init__(self):
self.layout = QVBoxLayout(self)
self.groups = {}
classes = {
"TRANSCRIBE FILE": (TranscriberToolSettingsTab, 1.5),
"SCRAPE DOCUMENTATION": (ScrapeDocumentationTab, 1.5),
"TEST VISION MODEL": (VisionToolSettingsTab, 6),
# "PERFORM OCR": (OcrToolSettingsTab, 2),
# "KEYWORD EXTRACTOR": (KeywordExtractorTab, 3),
"MISC": (MiscTab, 1),
"TRANSCRIBE FILE": (TranscriberToolSettingsTab, 3),
"SCRAPE DOCUMENTATION": (ScrapeDocumentationTab, 3),
"TEST VISION MODELS": (VisionToolSettingsTab, 2),
"OPTICAL CHARACTER RECOGNITION": (OCRToolSettingsTab, 3),
"MISC": (MiscTab, 2),
}
for title, (TabClass, stretch) in classes.items():
settings = TabClass()
Expand Down
181 changes: 181 additions & 0 deletions src/gui_tabs_tools_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
import multiprocessing
from pathlib import Path
import fitz
from PySide6.QtWidgets import (
QWidget, QHBoxLayout, QVBoxLayout, QPushButton, QLabel,
QComboBox, QFileDialog, QMessageBox
)
from PySide6.QtCore import QThread, Signal
from module_ocr import process_documents

def check_cuda_availability():
    """Report whether PyTorch is importable and sees a CUDA device.

    Returns:
        The result of ``torch.cuda.is_available()``, or ``False`` when an
        ``ImportError`` is raised (e.g. PyTorch is not installed).
    """
    cuda_ready = False
    try:
        # Imported lazily so the module loads even without PyTorch.
        import torch

        cuda_ready = torch.cuda.is_available()
    except ImportError:
        pass
    return cuda_ready

def get_pdf_page_count(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``.

    Any exception raised while opening or reading the document is printed
    and reported as a page count of ``0`` instead of propagating.
    """
    try:
        with fitz.open(pdf_path) as document:
            page_total = document.page_count
    except Exception as exc:
        print(f"Error reading PDF: {exc}")
        page_total = 0
    return page_total

def run_ocr_process(pdf_path, backend, model_path):
    """Run ``process_documents`` on one PDF and report the outcome.

    Args:
        pdf_path: Location of the PDF; wrapped in ``Path`` before use.
        backend: Backend identifier forwarded to ``process_documents``.
        model_path: Model location forwarded to ``process_documents``;
            replaced by ``None`` when ``backend`` is ``"tesseract"``.

    Returns:
        ``(True, None)`` on success, or ``(False, <error text>)`` when any
        exception is raised along the way.
    """
    try:
        # The tesseract backend is always invoked without a model path.
        effective_model = None if backend == "tesseract" else model_path
        process_documents(
            pdf_paths=Path(pdf_path),
            backend=backend,
            model_path=effective_model,
        )
    except Exception as exc:
        return False, str(exc)
    return True, None

class OcrWorkerThread(QThread):
    """Worker thread that runs one OCR job and reports how it ended.

    ``finished_signal`` is emitted once with ``(success, message)``:
    ``message`` holds the error text on failure and is empty on success.
    """

    finished_signal = Signal(bool, str)

    # Model used when the caller does not supply one.
    DEFAULT_MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized"

    def __init__(self, pdf_path, backend, model_path, parent=None):
        """Store the job parameters.

        Args:
            pdf_path: PDF to process.
            backend: Backend identifier handed to ``run_ocr_process``.
            model_path: Model to use; a falsy value (e.g. ``None``) selects
                ``DEFAULT_MODEL_PATH``.
            parent: Optional Qt parent object.
        """
        super().__init__(parent)
        self.pdf_path = pdf_path
        self.backend = backend
        # Honour the caller's model_path; it used to be accepted and then
        # discarded in favour of the hard-coded default.
        self.model_path = model_path or self.DEFAULT_MODEL_PATH

    def run(self):
        """Execute the OCR job and emit ``finished_signal`` with its result."""
        success, message = run_ocr_process(self.pdf_path, self.backend, self.model_path)
        # run_ocr_process reports success as (True, None); send an empty
        # string so the payload matches the declared (bool, str) signature.
        self.finished_signal.emit(success, message or "")

class OCRToolSettingsTab(QWidget):
    """Tool panel for picking a PDF and an OCR engine and running OCR on it.

    The OCR job runs on an ``OcrWorkerThread``; the controls are disabled
    while it runs and the outcome is shown in a status label and a dialog.
    """

    # Combo-box label -> backend identifier handed to the worker thread.
    ENGINE_MAPPING = {
        "Tesseract": "tesseract",
        "GOT_OCR": "got",
    }

    def __init__(self):
        """Build the widgets and start with every control enabled."""
        super().__init__()
        self.worker_thread = None
        self.selected_pdf_file = None
        self.model_path = None
        self.create_layout()
        self.setButtons(True)

    def create_layout(self):
        """Create the control row, the file-path label and the status label."""
        controls_row = QHBoxLayout()

        engine_caption = QLabel("OCR Engine")
        controls_row.addWidget(engine_caption)

        self.engine_combo = QComboBox()
        self.engine_combo.addItems(["Tesseract", "GOT_OCR"])
        self.engine_combo.setCurrentText("Tesseract")
        controls_row.addWidget(self.engine_combo)

        self.select_pdf_button = QPushButton("Choose PDF")
        self.select_pdf_button.clicked.connect(self.select_pdf_file)
        controls_row.addWidget(self.select_pdf_button)

        self.process_button = QPushButton("Process")
        self.process_button.clicked.connect(self.start_ocr_process)
        controls_row.addWidget(self.process_button)

        # Relative widths of the widgets in the control row.
        stretch_factors = (
            (engine_caption, 1),
            (self.engine_combo, 2),
            (self.select_pdf_button, 1),
            (self.process_button, 1),
        )
        for widget, factor in stretch_factors:
            controls_row.setStretchFactor(widget, factor)

        outer_layout = QVBoxLayout()
        outer_layout.addLayout(controls_row)

        self.file_path_label = QLabel("No PDF file selected")
        outer_layout.addWidget(self.file_path_label)

        self.status_label = QLabel("")
        self.status_label.setStyleSheet("color: gray;")
        outer_layout.addWidget(self.status_label)

        self.setLayout(outer_layout)

    def setButtons(self, enabled):
        """Enable or disable the controls; enabling also clears the status text."""
        for control in (self.select_pdf_button, self.process_button, self.engine_combo):
            control.setEnabled(enabled)
        if enabled:
            self.status_label.setText("")

    def select_pdf_file(self):
        """Ask the user for a PDF and remember the choice."""
        chosen, _ = QFileDialog.getOpenFileName(
            self,
            "Select PDF File",
            str(Path.cwd()),
            "PDF Files (*.pdf)",
        )
        if not chosen:
            return

        pdf = Path(chosen)
        # Show only "<parent folder>/<file>"; the tooltip carries the full path.
        self.file_path_label.setText(f"...{pdf.parent.name}/{pdf.name}")
        self.file_path_label.setToolTip(str(pdf.absolute()))
        self.selected_pdf_file = chosen
        self.status_label.setText("")

    def show_error_message(self, message):
        """Mark the status label as failed and show ``message`` in a dialog."""
        self.status_label.setStyleSheet("color: red;")
        self.status_label.setText("Error: OCR process failed")
        QMessageBox.critical(self, "Error", f"OCR process failed:\n{message}")

    def show_success_message(self):
        """Mark the status label as succeeded and show a confirmation dialog."""
        notice = "A .txt file has been saved in the same directory as the original .pdf."
        self.status_label.setStyleSheet("color: #4CAF50;")
        self.status_label.setText(f"Success! {notice}")
        QMessageBox.information(self, "Success!", notice)

    def _got_backend_confirmed(self):
        """Run the pre-flight checks for the GOT backend.

        Returns ``False`` when CUDA is unavailable or the user cancels the
        large-PDF confirmation, ``True`` otherwise.
        """
        if not check_cuda_availability():
            QMessageBox.warning(
                self,
                "CUDA Not Available",
                "GOT_OCR requires PyTorch with CUDA support. Please use Tesseract instead.",
            )
            return False

        page_count = get_pdf_page_count(self.selected_pdf_file)
        if page_count <= 100:
            return True

        reply = QMessageBox.question(
            self,
            "Large PDF Warning",
            f"The selected PDF has {page_count} pages. "
            "Are you sure that you want to proceed with the GOT_OCR backend? "
            "This backend processes a single page in approximately 10-15 seconds.",
            QMessageBox.StandardButton.Ok | QMessageBox.StandardButton.Cancel,
        )
        return reply != QMessageBox.StandardButton.Cancel

    def start_ocr_process(self):
        """Validate the selection, then launch the OCR worker thread."""
        if not self.selected_pdf_file:
            QMessageBox.warning(self, "Warning", "Please select a PDF file first.")
            return

        selected_engine = self.engine_combo.currentText()
        backend = self.ENGINE_MAPPING[selected_engine]

        if backend == "got" and not self._got_backend_confirmed():
            return

        self.status_label.setStyleSheet("color: #0074D9;")
        self.status_label.setText(f"Processing with {selected_engine}...")
        print(f"Starting OCR process for {self.selected_pdf_file}")

        self.setButtons(False)

        # Keep a reference on the instance for the duration of the job.
        self.worker_thread = OcrWorkerThread(self.selected_pdf_file, backend, self.model_path)
        self.worker_thread.finished_signal.connect(self.ocr_finished)
        self.worker_thread.start()

    def ocr_finished(self, success, message):
        """Re-enable the controls and report the worker's outcome."""
        # Re-enable first: setButtons(True) clears the status text, which the
        # message helpers below then overwrite.
        self.setButtons(True)
        if not success:
            self.show_error_message(message)
            return
        self.show_success_message()
Loading

0 comments on commit 10f4c1e

Please sign in to comment.