File tree Expand file tree Collapse file tree 2 files changed +26
-3
lines changed Expand file tree Collapse file tree 2 files changed +26
-3
lines changed Original file line number Diff line number Diff line change
1
+ {
2
+ "IsEncrypted" : false ,
3
+ "Values" : {
4
+ "AIService__DocumentIntelligence__Endpoint" : " <documentIntelligenceEndpoint>" ,
5
+ "AIService__DocumentIntelligence__Key" : " <documentIntelligenceKey if not using identity>" ,
6
+ "AIService__Language__Endpoint" : " <languageEndpoint>" ,
7
+ "AIService__Language__Key" : " <languageKey if not using identity>" ,
8
+ "FunctionApp__ClientId" : " <clientId of the function app if using user assigned managed identity>" ,
9
+ "IdentityType" : " <identityType: system_assigned, user_assigned, or key>" ,
10
+ "OpenAI__ApiKey" : " <openAIKey if not using managed identity>" ,
11
+ "OpenAI__ApiVersion" : " <openAIApiVersion>" ,
12
+ "OpenAI__Endpoint" : " <openAIEndpoint>" ,
13
+ "OpenAI__MultiModalDeployment" : " <openAIMultiModalDeploymentId>" ,
14
+ "StorageAccount__ConnectionString" : " <connectionString if not using managed identity>" ,
15
+ "StorageAccount__Endpoint" : " <endpoint if using identity-based connections>"
16
+ }
17
+ }
Original file line number Diff line number Diff line change @@ -71,15 +71,21 @@ def clean_text(src_text: str) -> str:
71
71
try :
72
72
# Define specific patterns for each tag
73
73
tag_patterns = {
74
- "figurecontent" : r"<!-- FigureContent=(.*?)-->" ,
74
+ "figurecontent" : r"<!--.*? FigureContent=(.*?)-->" ,
75
75
"figure" : r"<figure>(.*?)</figure>" ,
76
76
"figures" : r"\(figures/\d+\)(.*?)\(figures/\d+\)" ,
77
77
"figcaption" : r"<figcaption>(.*?)</figcaption>" ,
78
78
}
79
79
cleaned_text = remove_markdown_tags (src_text , tag_patterns )
80
80
81
- # remove line breaks
82
- cleaned_text = re .sub (r"\n" , "" , cleaned_text )
81
+ # remove html tags
82
+ cleaned_text = re .sub (r"<.*?>" , "" , cleaned_text )
83
+
84
+ # Replace newline characters with spaces
85
+ cleaned_text = re .sub (r"\n" , " " , cleaned_text )
86
+
87
+ # Replace multiple whitespace characters with a single space
88
+ cleaned_text = re .sub (r"\s+" , " " , cleaned_text )
83
89
84
90
# remove stopwords
85
91
tokens = word_tokenize (cleaned_text , "english" )
You can’t perform that action at this time.
0 commit comments