chore: formatting improvements, parameters for sankey #3

Merged · 3 commits · Feb 22, 2024
6 changes: 0 additions & 6 deletions .github/workflows/main.yaml
@@ -24,12 +24,6 @@ jobs:
- name: Install dependencies
run: make install-dev

- name: Install nbstripout
run: pip install nbstripout

- name: Remove notebook outputs
run: nbstripout --install

- name: Run formatting
run: make format

16 changes: 16 additions & 0 deletions README.md
@@ -43,3 +43,19 @@ Analyze the behavior and performance of Moveo.AI virtual agents.
## Used for creating the graphs

https://plotly.com/python/

## Contributing

### To ensure that your commits [exclude any notebook outputs](https://gist.github.com/33eyes/431e3d432f73371509d176d0dfb95b6e) while contributing to the project, execute the following command within your terminal, in the project's root directory:

```bash
git config filter.strip-notebook-output.clean 'jupyter nbconvert --ClearOutputPreprocessor.enabled=True --to=notebook --stdin --stdout --log-level=ERROR'
```

### The project uses a Makefile.

To see all the available commands:

```bash
make help
```
1 change: 1 addition & 0 deletions notebooks/.gitattributes
@@ -0,0 +1 @@
*.ipynb filter=strip-notebook-output
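
This `.gitattributes` entry pairs with the `filter.strip-notebook-output.clean` command added to the README above, and together they replace the nbstripout steps dropped from the CI workflow. For intuition, here is a minimal Python sketch of what that clean filter does to a notebook, using nbconvert's `ClearOutputPreprocessor` (the same preprocessor the README command enables; the notebook path is only an example):

```python
import nbformat
from nbconvert.preprocessors import ClearOutputPreprocessor

# Read a notebook, clear all cell outputs and execution counts, write it back.
# This mirrors what the strip-notebook-output clean filter does on `git add`.
nb = nbformat.read("notebooks/dialog_flow_analysis.ipynb", as_version=4)
cleared, _ = ClearOutputPreprocessor().preprocess(nb, {})
nbformat.write(cleared, "notebooks/dialog_flow_analysis.ipynb")
```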
6,782 changes: 40 additions & 6,742 deletions notebooks/dialog_flow_analysis.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion notebooks/human_agent_analysis.ipynb
@@ -33,7 +33,7 @@
"metadata": {},
"outputs": [],
"source": [
"df = analyze_agents(\"Flows analysis - Agents Acc Issues BR_agents.csv\")\n",
"df = analyze_agents(\"acc_issues_GR&CY_agents.csv\")\n",
"df.head()"
]
},
27 changes: 18 additions & 9 deletions notebooks/utils/dialog_flow_analysis/common.py
@@ -123,7 +123,7 @@ def fetch_data_analytics_api(session_id: str, account_id: str, api_key: str) ->
except requests.exceptions.HTTPError as e:
if e.response.status_code == 404:
logger.warning(
f"Received a 404 from the logs API for session {session_id}."
f"Received a 404 from the logs API for session {session_id}. "
"This is probably because the session does not exist"
)
return []
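
The trailing space added above matters because Python concatenates adjacent string literals verbatim, so without it the two parts of the log message run together. A small, self-contained illustration (generic Python, not code from this repository; the session id is a made-up value):

```python
session_id = "abc123"  # hypothetical value, for illustration only

without_space = (
    f"Received a 404 from the logs API for session {session_id}."
    "This is probably because the session does not exist"
)
with_space = (
    f"Received a 404 from the logs API for session {session_id}. "
    "This is probably because the session does not exist"
)

print(without_space)  # ...session abc123.This is probably...
print(with_space)     # ...session abc123. This is probably...
```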
@@ -211,7 +211,7 @@ def analyze_agents(content_csv_fname: str):
return df


- def analyze_flows(content_csv_fname: str):
+ def analyze_flows(content_csv_fname: str, min_transitions_displayed=3):
"""
Analyze flows from a CSV file containing session data.

@@ -280,21 +280,23 @@ def analyze_flows(content_csv_fname: str):
# Generate Sankey diagrams and histograms for visualization
create_sankey(
detractors,
title="Detractors Journey Flow"
title="Detractors Journey Flow "
f"(Ratings: {RATING_MIN}-{DETRACTORS_UPPER_LIMIT})",
fname=content_csv_fname,
+ top_k=min_transitions_displayed,
)
create_sankey(
promoters,
title=f"Promoters Journey Flow (Ratings: {PROMOTERS_LOWER_LIMIT}-{RATING_MAX})",
fname=content_csv_fname,
+ top_k=min_transitions_displayed,
)
create_sankey(
neutral,
title="Neutrals Journey Flow (Ratings:"
title="Neutrals Journey Flow (Ratings: "
f"{DETRACTORS_UPPER_LIMIT + 1}-{PROMOTERS_LOWER_LIMIT - 1})",
fname=content_csv_fname,
- top_k=3,
+ top_k=min_transitions_displayed,
)
# Generate the ratings histogram
plot_ratings_histogram(
@@ -329,7 +331,12 @@ def analyze_flows(content_csv_fname: str):


# NOTE: top_k should be adjusted to get better results depending on the amount of data.
- def create_sankey(df: pd.DataFrame, title: str, fname: str = None, top_k=10):
+ def create_sankey(
+     df: pd.DataFrame,
+     title: str,
+     fname: str = None,
+     top_k=3,
+ ):
"""
Create a Sankey diagram from a DataFrame.

@@ -339,7 +346,9 @@ def create_sankey(df: pd.DataFrame, title: str, fname: str = None, top_k=10):
Args:
df (pd.DataFrame): DataFrame containing flow data.
title (str): Title of the Sankey diagram.
- top_k (int, optional): Min number of transitions to include. Defaults to 10.
+ top_k (int, optional): Min number of transitions to include.
+     Defaults to 3. For a large amount of data,
+     increase it in order to get readable diagrams.

Returns:
None
@@ -366,7 +375,7 @@ def create_sankey(df: pd.DataFrame, title: str, fname: str = None, top_k=10):

# Filter transitions by count
transition_counts = Counter(
- {item: count for item, count in transition_counts.items() if count > top_k}
+ {item: count for item, count in transition_counts.items() if count >= top_k}
)

# Get unique states
@@ -383,7 +392,7 @@ def create_sankey(df: pd.DataFrame, title: str, fname: str = None, top_k=10):
values = []

for (source, target), count in transition_counts.items():
- if count > 10:
+ if count >= top_k:
sources.append(state_to_index[source])
targets.append(state_to_index[target])
values.append(count)
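Taken together, the changes above thread a new `min_transitions_displayed` argument from `analyze_flows` into `create_sankey`'s `top_k`, which now acts as an inclusive threshold (`count >= top_k`) on how often a transition must occur before it is drawn. A minimal usage sketch, assuming the helpers are imported from `utils.dialog_flow_analysis.common` as the notebook paths suggest (the CSV file names are illustrative):

```python
from utils.dialog_flow_analysis.common import analyze_flows, create_sankey

# Small export: the default threshold of 3 keeps rare transitions visible.
analyze_flows("acc_issues_GR&CY_flows.csv")

# Large export: raise the threshold so the Sankey diagrams stay readable.
analyze_flows("acc_issues_GR&CY_flows.csv", min_transitions_displayed=10)

# create_sankey can also be called directly on a prepared DataFrame, e.g.:
# create_sankey(df, title="Journey Flow", fname="flows.csv", top_k=5)
```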
2 changes: 1 addition & 1 deletion notebooks/utils/dialog_flow_analysis/zendesk.py
@@ -231,7 +231,7 @@ def split_agents_and_brain_tickets(csv_fname: str, brain_name: str):
len(df_virtual_assistant) + len(df_agents) + len(df_no_agent)
)
if total_rows_output_files == len(df):
- logger.info("Total rows in output files match total rows in input file.")
+ logger.debug("Total rows in output files match total rows in input file.")
else:
logger.warning(
"Total rows in output files do not match total rows in input file."
45 changes: 4 additions & 41 deletions notebooks/zendesk_utilities.ipynb
@@ -31,7 +31,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -79,51 +79,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"WARNING: Output file '/home/nikos/moveogithub/virtual-agent-analysis/data/Flows analysis - Agents Acc Issues BR_virtual_assistant.csv' already exists.Do you want to override it? (Y/n): Y\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Output file '/home/nikos/moveogithub/virtual-agent-analysis/data/Flows analysis - Agents Acc Issues BR_virtual_assistant.csv' will be overwritten.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Splitting tickets: 100%|██████████████████████████████████████████████████████████████████████| 10925/10925 [00:00<00:00, 20296.11it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Split and saved DataFrame to CSV files.\n",
"Total rows in output files match total rows in input file.\n"
]
}
],
"outputs": [],
"source": [
"split_agents_and_brain_tickets(\n",
" \"Flows analysis - Agents Acc Issues BR.csv\", \"BR Virtual Assistant\"\n",
" \"acc_issues_GR&CY.csv\", \"GR & CY & CZ & ON & NG Virtual Assistant\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {