Exclude certain terms from key phrase processing and remove the figurecontent tag from ADI output (#33)

PoulamiDasDA · Poulami Das · web-flow · commit 33efba4678c2 · 2024-09-23T23:44:54.000+05:30
Co-authored-by: Poulami Das <podas@microsoft.com>
diff --git a/adi_function_app/local.settings.json b/adi_function_app/local.settings.json
@@ -0,0 +1,17 @@
+{
+  "IsEncrypted": false,
+  "Values": {
+    "AIService__DocumentIntelligence__Endpoint": "<documentIntelligenceEndpoint>",
+    "AIService__DocumentIntelligence__Key": "<documentIntelligenceKey if not using identity>",
+    "AIService__Language__Endpoint": "<languageEndpoint>",
+    "AIService__Language__Key": "<languageKey if not using identity>",
+    "FunctionApp__ClientId": "<clientId of the function app if using user assigned managed identity>",
+    "IdentityType": "<identityType: system_assigned, user_assigned, or key>",
+    "OpenAI__ApiKey": "<openAIKey if using non managed identity>",
+    "OpenAI__ApiVersion": "<openAIApiVersion>",
+    "OpenAI__Endpoint": "<openAIEndpoint>",
+    "OpenAI__MultiModalDeployment": "<openAIMultiModalDeploymentId>",
+    "StorageAccount__ConnectionString": "<connectionString if using non managed identity>",
+    "StorageAccount__Endpoint": "<Endpoint if using identity based connections>"
+  }
+}
diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py
@@ -71,15 +71,21 @@ def clean_text(src_text: str) -> str:
     try:
         # Define specific patterns for each tag
         tag_patterns = {
-            "figurecontent": r"<!-- FigureContent=(.*?)-->",
+            "figurecontent": r"<!--.*?FigureContent=(.*?)-->",
             "figure": r"<figure>(.*?)</figure>",
             "figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)",
             "figcaption": r"<figcaption>(.*?)</figcaption>",
         }
         cleaned_text = remove_markdown_tags(src_text, tag_patterns)
 
-        # remove line breaks
-        cleaned_text = re.sub(r"\n", "", cleaned_text)
+        # remove html tags
+        cleaned_text = re.sub(r"<.*?>", "", cleaned_text)
+
+        # Replace newline characters with spaces
+        cleaned_text = re.sub(r"\n", " ", cleaned_text)
+
+        # Replace multiple whitespace characters with a single space
+        cleaned_text = re.sub(r"\s+", " ", cleaned_text)
 
         # remove stopwords
         tokens = word_tokenize(cleaned_text, "english")