From ea94d0a5ff24f4518f6c6f7eb4d86e38f2ec1d1e Mon Sep 17 00:00:00 2001
From: rachlllg
Date: Thu, 6 Mar 2025 09:05:03 -0800
Subject: [PATCH] feat(model): use FLP model, update context window and sentence overlap ratio

- Update default config and documentation to use the FLP finetuned sentence
  transformer model
- Update the context window to use 512 tokens to allow for retrieval
  highlighting
- Update sentence overlap ratio to overlap by 2 sentences using the new
  context window
---
 .env.example        | 8 ++++----
 README.md           | 8 ++++----
 inception/config.py | 6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.env.example b/.env.example
index da98b41..d93c5a2 100644
--- a/.env.example
+++ b/.env.example
@@ -1,8 +1,8 @@
 # Model Settings
-TRANSFORMER_MODEL_NAME=nomic-ai/modernbert-embed-base
-TRANSFORMER_MODEL_VERSION=d556a88e332558790b210f7bdbe87da2fa94a8d8
-MAX_TOKENS=8192
-OVERLAP_RATIO=0.002
+TRANSFORMER_MODEL_NAME=Free-Law-Project/modernbert-embed-base_finetune_512
+TRANSFORMER_MODEL_VERSION=main
+MAX_TOKENS=512
+OVERLAP_RATIO=0.004
 MIN_TEXT_LENGTH=1
 MAX_TEXT_LENGTH = 10000000
 MAX_QUERY_LENGTH = 100
diff --git a/README.md b/README.md
index d7efb5c..df81dc9 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ The service is optimized to handle two main use cases:
 
 ## Features
 
-- Specialized text embedding generation for legal documents using the `nomic-ai/modernbert-embed-base`
+- Specialized text embedding generation for legal documents using `Free-Law-Project/modernbert-embed-base_finetune_512`, a `sentence_transformer` model finetuned on top of `nomic-ai/modernbert-embed-base`
 - Intelligent text chunking optimized for court opinions, based on sentence boundaries
 - Dedicated CPU-based processing for search queries, ensuring fast response times
 - GPU acceleration support for processing lengthy court opinions
@@ -33,7 +33,7 @@ cp .env.example .env
 Model Settings:
 - `TRANSFORMER_MODEL_NAME`
 
-  Default: `nomic-ai/modernbert-embed-base`
+  Default: `Free-Law-Project/modernbert-embed-base_finetune_512`
 
   The name or path of the SentenceTransformer model to use for generating embeddings.
 
@@ -45,13 +45,13 @@ Model Settings:
 
 - `MAX_TOKENS`
 
-  Default: `8192` (Range: 512–10000)
+  Default: `512` (Range: 256–10000)
 
   Maximum number of tokens per chunk when splitting text. If the text exceeds this limit, it is split into multiple chunks based on sentence boundaries. If a sentence exceeds this limit, it is truncated. Sentences are defined by `nltk.tokenize.sent_tokenize`, which follows English heuristics to detect sentence boundaries.
 
 - `OVERLAP_RATIO`
 
-  Default: `0.002` (Range: 0-0.01)
+  Default: `0.004` (Range: 0-0.01)
 
   The ratio to calculate the number of sentences to overlap between chunks when splitting text. Sentences are defined by `nltk.tokenize.sent_tokenize`, which follows English heuristics to detect sentence boundaries. `num_overlap_sentences = int(MAX_TOKENS * OVERLAP_RATIO)`.
 
diff --git a/inception/config.py b/inception/config.py
index 5440a07..b52cd4b 100644
--- a/inception/config.py
+++ b/inception/config.py
@@ -4,7 +4,7 @@
 
 class Settings(BaseSettings):
     transformer_model_name: str = Field(
-        "nomic-ai/modernbert-embed-base",
+        "Free-Law-Project/modernbert-embed-base_finetune_512",
         description="Name of the transformer model to use",
     )
     transformer_model_version: str = Field(
@@ -12,10 +12,10 @@
         description="Version of the transformer model to use",
     )
     max_tokens: int = Field(
-        8192, ge=512, le=10000, description="Maximum tokens per chunk"
+        512, ge=256, le=10000, description="Maximum tokens per chunk"
     )
     overlap_ratio: float = Field(
-        0.002,
+        0.004,
         ge=0,
         le=0.01,
         description="Ratio to calculate number of sentence overlap between chunks",
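
With the new defaults, the overlap rule documented in the README, num_overlap_sentences = int(MAX_TOKENS * OVERLAP_RATIO), works out to int(512 * 0.004) = 2 sentences shared between consecutive chunks, matching the commit message. The sketch below only illustrates that documented behaviour and is not the service's actual implementation: the chunk_text helper name and the whitespace token count (standing in for the model tokenizer) are placeholders, and truncation of single sentences longer than MAX_TOKENS is omitted.

from nltk.tokenize import sent_tokenize  # requires the NLTK 'punkt' models

MAX_TOKENS = 512       # new default from this patch
OVERLAP_RATIO = 0.004  # new default from this patch


def chunk_text(text: str) -> list[str]:
    """Greedily pack sentences into chunks of about MAX_TOKENS
    whitespace-counted tokens, overlapping consecutive chunks by
    int(MAX_TOKENS * OVERLAP_RATIO) sentences."""
    num_overlap = int(MAX_TOKENS * OVERLAP_RATIO)  # int(512 * 0.004) == 2
    chunks: list[list[str]] = []
    current: list[str] = []
    current_tokens = 0
    for sentence in sent_tokenize(text):
        n_tokens = len(sentence.split())  # placeholder for the real tokenizer
        if current and current_tokens + n_tokens > MAX_TOKENS:
            chunks.append(current)
            # Carry the last `num_overlap` sentences into the next chunk.
            current = current[-num_overlap:] if num_overlap else []
            current_tokens = sum(len(s.split()) for s in current)
        current.append(sentence)
        current_tokens += n_tokens
    if current:
        chunks.append(current)
    return [" ".join(chunk) for chunk in chunks]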