From ea94d0a5ff24f4518f6c6f7eb4d86e38f2ec1d1e Mon Sep 17 00:00:00 2001
From: rachlllg
Date: Thu, 6 Mar 2025 09:05:03 -0800
Subject: [PATCH] feat(model): use FLP model, update context window and sentence overlap ratio

- Update default config and documentation to use the FLP finetuned sentence
  transformer model
- Update the context window to use 512 tokens to allow for retrieval
  highlighting
- Update sentence overlap ratio to overlap by 2 sentences using the new
  context window
---
 .env.example        | 8 ++++----
 README.md           | 8 ++++----
 inception/config.py | 6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.env.example b/.env.example
index da98b41..d93c5a2 100644
--- a/.env.example
+++ b/.env.example
@@ -1,8 +1,8 @@
 # Model Settings
-TRANSFORMER_MODEL_NAME=nomic-ai/modernbert-embed-base
-TRANSFORMER_MODEL_VERSION=d556a88e332558790b210f7bdbe87da2fa94a8d8
-MAX_TOKENS=8192
-OVERLAP_RATIO=0.002
+TRANSFORMER_MODEL_NAME=Free-Law-Project/modernbert-embed-base_finetune_512
+TRANSFORMER_MODEL_VERSION=main
+MAX_TOKENS=512
+OVERLAP_RATIO=0.004
 MIN_TEXT_LENGTH=1
 MAX_TEXT_LENGTH = 10000000
 MAX_QUERY_LENGTH = 100
diff --git a/README.md b/README.md
index d7efb5c..df81dc9 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ The service is optimized to handle two main use cases:
 
 ## Features
 
-- Specialized text embedding generation for legal documents using the `nomic-ai/modernbert-embed-base`
+- Specialized text embedding generation for legal documents using `Free-Law-Project/modernbert-embed-base_finetune_512`, a `sentence_transformer` model finetuned on top of `nomic-ai/modernbert-embed-base`
 - Intelligent text chunking optimized for court opinions, based on sentence boundaries
 - Dedicated CPU-based processing for search queries, ensuring fast response times
 - GPU acceleration support for processing lengthy court opinions
@@ -33,7 +33,7 @@ cp .env.example .env
 Model Settings:
 - `TRANSFORMER_MODEL_NAME`
 
-  Default: `nomic-ai/modernbert-embed-base`
+  Default: `Free-Law-Project/modernbert-embed-base_finetune_512`
 
   The name or path of the SentenceTransformer model to use for generating embeddings.
 
@@ -45,13 +45,13 @@ Model Settings:
 
 - `MAX_TOKENS`
 
-  Default: `8192` (Range: 512–10000)
+  Default: `512` (Range: 256–10000)
 
   Maximum number of tokens per chunk when splitting text. If the text exceeds this limit, it is split into multiple chunks based on sentence boundaries. If a sentence exceeds this limit, it is truncated. Sentences are defined by `nltk.tokenize.sent_tokenize`, which follows English heuristics to detect sentence boundaries.
 
 - `OVERLAP_RATIO`
 
-  Default: `0.002` (Range: 0-0.01)
+  Default: `0.004` (Range: 0-0.01)
 
   The ratio to calculate the number of sentences to overlap between chunks when splitting text. Sentences are defined by `nltk.tokenize.sent_tokenize`, which follows English heuristics to detect sentence boundaries. `num_overlap_sentences = int(MAX_TOKENS * OVERLAP_RATIO)`.
 
diff --git a/inception/config.py b/inception/config.py
index 5440a07..b52cd4b 100644
--- a/inception/config.py
+++ b/inception/config.py
@@ -4,7 +4,7 @@
 
 class Settings(BaseSettings):
     transformer_model_name: str = Field(
-        "nomic-ai/modernbert-embed-base",
+        "Free-Law-Project/modernbert-embed-base_finetune_512",
         description="Name of the transformer model to use",
     )
     transformer_model_version: str = Field(
@@ -12,10 +12,10 @@
         description="Version of the transformer model to use",
     )
     max_tokens: int = Field(
-        8192, ge=512, le=10000, description="Maximum tokens per chunk"
+        512, ge=256, le=10000, description="Maximum tokens per chunk"
     )
     overlap_ratio: float = Field(
-        0.002,
+        0.004,
         ge=0,
         le=0.01,
         description="Ratio to calculate number of sentence overlap between chunks",
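
With the new defaults, the overlap rule documented in the README, num_overlap_sentences = int(MAX_TOKENS * OVERLAP_RATIO), works out to int(512 * 0.004) = 2 sentences shared between consecutive chunks, matching the commit message. The sketch below only illustrates that documented behaviour and is not the service's actual implementation: the chunk_text helper name and the whitespace token count (standing in for the model tokenizer) are placeholders, and truncation of single sentences longer than MAX_TOKENS is omitted.

from nltk.tokenize import sent_tokenize  # requires the NLTK 'punkt' models

MAX_TOKENS = 512       # new default from this patch
OVERLAP_RATIO = 0.004  # new default from this patch


def chunk_text(text: str) -> list[str]:
    """Greedily pack sentences into chunks of about MAX_TOKENS
    whitespace-counted tokens, overlapping consecutive chunks by
    int(MAX_TOKENS * OVERLAP_RATIO) sentences."""
    num_overlap = int(MAX_TOKENS * OVERLAP_RATIO)  # int(512 * 0.004) == 2
    chunks: list[list[str]] = []
    current: list[str] = []
    current_tokens = 0
    for sentence in sent_tokenize(text):
        n_tokens = len(sentence.split())  # placeholder for the real tokenizer
        if current and current_tokens + n_tokens > MAX_TOKENS:
            chunks.append(current)
            # Carry the last `num_overlap` sentences into the next chunk.
            current = current[-num_overlap:] if num_overlap else []
            current_tokens = sum(len(s.split()) for s in current)
        current.append(sentence)
        current_tokens += n_tokens
    if current:
        chunks.append(current)
    return [" ".join(chunk) for chunk in chunks]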