Merge pull request #193 from amosproj/dev
sprint-09-release
rbbozkurt authored Jan 10, 2024
2 parents 3a9dfc6 + e0d68f3 commit 59f1a76
Showing 19 changed files with 1,431 additions and 259 deletions.
35 changes: 35 additions & 0 deletions Documentation/Google-Search-Strategy.md
@@ -0,0 +1,35 @@
<!--
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2023 Lucca Baumgärtner <lucca.baumgaertner@fau.de>
-->

# Google Search Strategy

## Introduction

In order to gather more information about a lead, we query the Google Places API. The API has
multiple endpoints, enabling different search methods. To maximise the chances of correctly identifying a lead,
we combine these search methods and derive the most probable result.

### Available Lead Information

| First Name | Last Name | Phone Number | Email |
| ---------- | --------- | ------------- | ---------------------------- |
| Max | Muster | +491721234567 | max-muster@mybusiness.com |
| Melanie | Muster | +491322133321 | melanies-flowershop@gmail.nl |
| ... | ... | ... | ... |

### Available Search Methods

1. Fulltext Search
2. Phone Number Search
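
Both methods map onto the Places API *Find Place* endpoint, which accepts either a free-text query (`inputtype=textquery`) or an E.164 phone number (`inputtype=phonenumber`). The sketch below shows how such lookups could be issued; the environment variable name, the helper function, and the example queries are illustrative and do not reflect the repository's actual `GooglePlaces` step.

```python
import os

import requests

FIND_PLACE_URL = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"


def find_place(query: str, input_type: str) -> list[dict]:
    """Query the Places 'Find Place' endpoint.

    input_type is either 'textquery' (fulltext search) or 'phonenumber'
    (expects the number in E.164 format, e.g. +491721234567).
    """
    params = {
        "input": query,
        "inputtype": input_type,
        "fields": "place_id,name,formatted_address",
        "key": os.environ["GOOGLE_PLACES_API_KEY"],  # illustrative variable name
    }
    response = requests.get(FIND_PLACE_URL, params=params, timeout=10)
    response.raise_for_status()
    return response.json().get("candidates", [])


# Fulltext search, e.g. built from the available lead information
text_candidates = find_place("Melanie Muster flower shop", "textquery")

# Phone number search
phone_candidates = find_place("+491322133321", "phonenumber")
```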

## Search Strategy

1. Phone Number Search
   1. If there is a valid phone number, look it up
2. Email-Based Search
   1. If the email uses a custom domain, look up the domain
   2. Else: unless it contains the lead's full name, look up the email account (everything before the `@` sign)
3. If the email-based search returned any results, use those
4. Else: return the phone-based search results
5. Else: return nothing

![Search Strategy](./Media/google_places_strategy.drawio.png)
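
In addition to the diagram, the decision flow can be sketched in a few lines of Python. The callables passed in stand for the actual Places lookups (for example thin wrappers around the *Find Place* requests shown above), and the free-mail list and name check are simplified assumptions rather than the project's real implementation.

```python
import re
from typing import Callable

# Simplified, assumed list of free-mail providers; the real check may differ.
FREEMAIL_DOMAINS = {"gmail.com", "gmail.nl", "gmx.de", "web.de", "outlook.com"}


def choose_places_results(
    lead: dict,
    search_by_phone: Callable[[str], list[dict]],
    search_by_text: Callable[[str], list[dict]],
) -> list[dict]:
    """Combine phone- and email-based lookups as described above."""
    # 1. Phone number search, if a plausibly valid number is present
    phone_results: list[dict] = []
    phone = lead.get("phone_number", "")
    if re.fullmatch(r"\+\d{7,15}", phone):
        phone_results = search_by_phone(phone)

    # 2. Email-based search
    email_results: list[dict] = []
    local_part, _, domain = lead.get("email", "").partition("@")
    if domain and domain.lower() not in FREEMAIL_DOMAINS:
        # Custom domain, e.g. mybusiness.com
        email_results = search_by_text(domain)
    elif local_part:
        # Free-mail account: only look it up if it is more than just the name,
        # e.g. melanies-flowershop
        full_name = re.sub(
            r"[^a-z]", "", (lead.get("first_name", "") + lead.get("last_name", "")).lower()
        )
        account = re.sub(r"[^a-z]", "", local_part.lower())
        if not full_name or full_name not in account:
            email_results = search_by_text(local_part)

    # 3.-5. Prefer email-based results, fall back to phone-based results, else nothing
    if email_results:
        return email_results
    return phone_results
```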
2 changes: 2 additions & 0 deletions Documentation/Media/google_places_strategy.drawio.png.license
@@ -0,0 +1,2 @@
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2023 Lucca Baumgärtner <lucca.baumgaertner@fau.de>
2 changes: 2 additions & 0 deletions Pipfile
@@ -38,6 +38,8 @@ pyspellchecker = "==0.7.2"
autocorrect = "==2.6.1"
textblob = "==0.17.1"
deep-translator = "==1.11.4"
fsspec = "2023.12.2"
s3fs = "2023.12.2"

[requires]
python_version = "3.10"
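
The new `fsspec`/`s3fs` pins let pandas resolve `s3://` URLs directly, which appears to be what the S3-backed historical-data mode in the preprocessing demo below relies on. A minimal sketch, assuming AWS credentials are configured and using a placeholder bucket:

```python
import pandas as pd

# With s3fs installed, pandas routes s3:// paths through fsspec automatically.
# Bucket and keys are placeholders, not paths used by the project.
df = pd.read_csv("s3://example-bucket/historical/leads.csv")
df.to_csv("s3://example-bucket/preprocessed/leads.csv", index=False)
```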
812 changes: 649 additions & 163 deletions Pipfile.lock

Large diffs are not rendered by default.

29 changes: 18 additions & 11 deletions src/bdc/steps/regionalatlas.py
@@ -106,7 +106,6 @@ def run(self) -> DataFrame:
self.df[f"{self.name.lower()}_regional_score"] = self.df.progress_apply(
lambda lead: pd.Series(self.calculate_regional_score(lead)), axis=1
)

return self.df

def finish(self) -> None:
@@ -230,17 +229,25 @@ def calculate_regional_score(self, lead) -> float | None:
:return: float | None - The computed score if the necessary fields are present for the lead. None otherwise.
"""

if (
lead[f"{self.name.lower()}_pop_density"] is None
or lead[f"{self.name.lower()}_employment_rate"] is None
or lead[f"{self.name.lower()}_disp_income_p_inhabitant"] is None
):
return None
pop_density_col = f"{self.name.lower()}_pop_density"
employment_rate_col = f"{self.name.lower()}_employment_rate"
income_col = f"{self.name.lower()}_disp_income_p_inhabitant"

pop_density = lead[pop_density_col]
employment_rate = lead[employment_rate_col]
income_per_inhabitant = lead[income_col]

pop_density = pop_density if pd.notnull(pop_density) else 0
employment_rate = employment_rate if pd.notnull(employment_rate) else 0
income_per_inhabitant = (
income_per_inhabitant if pd.notnull(income_per_inhabitant) else 0
)

regional_score = (
lead[f"{self.name.lower()}_pop_density"]
* lead[f"{self.name.lower()}_employment_rate"]
* lead[f"{self.name.lower()}_disp_income_p_inhabitant"]
pop_density * employment_rate * income_per_inhabitant
) / 1000000

return regional_score
if pd.notnull(regional_score):
return regional_score
else:
raise ValueError("Regional score is null")
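
For reference, a worked example of the updated scoring behaviour: a missing factor is now treated as `0`, so the score collapses to `0.0` where the old code returned `None`. The column prefix and the values below are made up.

```python
import pandas as pd

# Made-up values; the column prefix depends on the step's name attribute.
lead = pd.Series(
    {
        "regionalatlas_pop_density": 1500.0,
        "regionalatlas_employment_rate": 60.0,
        "regionalatlas_disp_income_p_inhabitant": None,  # missing -> treated as 0
    }
)


def _or_zero(value):
    return value if pd.notnull(value) else 0


pop_density = _or_zero(lead["regionalatlas_pop_density"])
employment_rate = _or_zero(lead["regionalatlas_employment_rate"])
income = _or_zero(lead["regionalatlas_disp_income_p_inhabitant"])

score = (pop_density * employment_rate * income) / 1000000
print(score)  # 0.0 -- the old code would have returned None here
```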
5 changes: 5 additions & 0 deletions src/demo/__init__.py
@@ -0,0 +1,5 @@
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Berkay Bozkurt <resitberkaybozkurt@gmail.com>

from .console_utils import get_int_input, get_multiple_choice, get_yes_no_input
from .demos import bdc_demo, db_demo, evp_demo, pipeline_demo, preprocessing_demo
76 changes: 76 additions & 0 deletions src/demo/console_utils.py
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Berkay Bozkurt <resitberkaybozkurt@gmail.com>


# Utility Functions
def get_yes_no_input(prompt: str) -> bool:
"""
Prompts the user with a given prompt and returns True if the user enters 'yes' or 'y',
and False if the user enters 'no' or 'n'. The input is case-insensitive.
Args:
prompt (str): The prompt to display to the user.
Returns:
bool: True if the user enters 'yes' or 'y', False if the user enters 'no' or 'n'.
"""
while True:
user_input = input(prompt).strip().lower()
if user_input in ["y", "yes"]:
return True
elif user_input in ["n", "no"]:
return False
else:
print("Invalid input. Please enter (yes/no) or (y/N).")


def get_int_input(prompt: str, input_range: range = None) -> int:
"""
Prompts the user for an integer input and validates it.
Args:
prompt (str): The prompt message to display to the user.
input_range (range, optional): The range of valid input values. Defaults to None.
Returns:
int: The validated integer input.
Raises:
ValueError: If the input is not a valid integer.
"""
while True:
try:
input_int = int(input(prompt))
if input_range is not None and input_int not in input_range:
print("Invalid input. Please enter a valid integer.")
continue
else:
return input_int
except ValueError:
print("Invalid input. Please enter a valid integer.")


def get_multiple_choice(prompt: str, choices: list) -> str:
"""
Prompts the user with a message and a list of choices, and returns the selected choice.
Args:
prompt (str): The message to display to the user.
choices (list): The list of choices to display to the user.
Returns:
str: The selected choice.
Raises:
ValueError: If the user enters an invalid input.
"""
while True:
try:
prompt += "".join(
f"({index}) : {choice} \n" for index, choice in enumerate(choices)
)
ind = get_int_input(prompt, range(len(choices)))
return choices[ind]
except ValueError:
print("Invalid input. Please enter a valid integer.")
152 changes: 81 additions & 71 deletions src/demos.py → src/demo/demos.py
@@ -6,28 +6,25 @@
# SPDX-FileCopyrightText: 2023 Ruchita Nathani <Ruchita.nathani@fau.de>
# SPDX-FileCopyrightText: 2023 Ahmed Sheta <ahmed.sheta@fau.de>


from sklearn.metrics import mean_squared_error

from bdc import DataCollector
from bdc.pipeline import Pipeline
from bdc.steps import (
AnalyzeEmails,
FacebookGraphAPI,
GooglePlaces,
GooglePlacesDetailed,
GPTReviewSentimentAnalyzer,
GPTSummarizer,
PreprocessPhonenumbers,
RegionalAtlas,
ScrapeAddress,
SmartReviewInsightsEnhancer,
)
from database import get_database
from database.parsers import LeadParser
from demo.console_utils import get_int_input, get_multiple_choice, get_yes_no_input
from demo.pipeline_utils import (
get_all_available_pipeline_json_configs,
get_pipeline_additional_steps,
get_pipeline_config_from_json,
get_pipeline_initial_steps,
)
from evp import EstimatedValuePredictor
from evp.data_processing import split_dataset
from evp.predictors import Predictors
from logger import get_logger
from preprocessing import Preprocessing

log = get_logger()

@@ -38,31 +35,11 @@
OUTPUT_FILE_BDC = "../data/collected_data.json"


# Utility Functions
def get_yes_no_input(prompt: str) -> bool:
while True:
user_input = input(prompt).strip().lower()
if user_input in ["y", "yes"]:
return True
elif user_input in ["n", "no"]:
return False
else:
print("Invalid input. Please enter (yes/no) or (y/N).")


def get_int_input(prompt: str) -> int:
while True:
try:
return int(input(prompt))
except ValueError:
print("Invalid input. Please enter a valid integer.")


# bdc_demo
def bdc_demo():
dc = DataCollector()
try:
choice = get_int_input("(1) Read CSV\n(2) Dummy API\n")
choice = get_int_input("(1) Read CSV\n(2) Dummy API\n", range(1, 3))
if choice == 1:
dc.get_data_from_csv(file_path=INPUT_FILE_BDC)
elif choice == 2:
@@ -94,7 +71,8 @@ def evp_demo():

while True:
choice = get_int_input(
"(1) Train\n(2) Test\n(3) Predict on single lead\n(4) Save model\n(5) Exit\n"
"(1) Train\n(2) Test\n(3) Predict on single lead\n(4) Save model\n(5) Exit\n",
range(1, 6),
)
if choice == 1:
evp.train(LEADS_TRAIN_FILE)
@@ -121,7 +99,9 @@ def test_evp_model(evp: EstimatedValuePredictor):

def predict_single_lead(evp: EstimatedValuePredictor):
leads = LeadParser.parse_leads_from_csv(LEADS_TEST_FILE)
lead_id = get_int_input(f"Choose a lead_id in range [0, {len(leads) - 1}]\n")
lead_id = get_int_input(
f"Choose a lead_id in range [0, {len(leads) - 1}]\n", range(len(leads))
)
if 0 <= lead_id < len(leads):
prediction = evp.estimate_value(leads[lead_id])
print(
@@ -134,54 +114,66 @@ def predict_single_lead(evp: EstimatedValuePredictor):
# db_demo
def db_demo():
amt_leads = get_database().get_cardinality()
lead_id = get_int_input(f"Choose a lead_id in range [1, {amt_leads}]\n")
lead_id = get_int_input(
f"Choose a lead_id in range [1, {amt_leads}]\n", range(1, amt_leads + 1)
)
if 1 <= lead_id <= amt_leads:
print(get_database().get_lead_by_id(lead_id))
else:
print("Invalid Choice")


def add_step_if_requested(
steps, step_classes, step_desc, step_warning_message: str = ""
):
def add_step_if_requested(steps, step_class, step_desc, step_warning_message: str = ""):
if get_yes_no_input(f"Run {step_desc} {step_warning_message}(y/N)?\n"):
force = get_yes_no_input("Force execution if data is present? (y/N)\n")
for cls in step_classes:
steps.append(cls(force_refresh=force))
steps.append(step_class(force_refresh=force))


# pipeline_demo
def pipeline_demo():
steps = [AnalyzeEmails(force_refresh=True)]
additional_steps = [
([ScrapeAddress], "Scrape Address", "(will take a long time)"),
([FacebookGraphAPI], "Facebook Graph API", "(will use token)"),
([PreprocessPhonenumbers], "Phone Number Validation", ""),
(
[GooglePlaces, GooglePlacesDetailed],
"Google API",
"(will use token and generate cost!)",
),
(
[GPTReviewSentimentAnalyzer, GPTSummarizer],
"open API Sentiment Analyzer & Summarizer",
"(will use token and generate cost!)",
),
(
[SmartReviewInsightsEnhancer],
"Smart Review Insights",
"(will take looong time!)",
),
([RegionalAtlas], "Regionalatlas", ""),
]

if get_yes_no_input(f"Run Demo pipeline with all steps (y/N)?\n"):
for step_classes, _, _ in additional_steps:
step_instances = [cls(force_refresh=True) for cls in step_classes]
steps.extend(step_instances)
else:
for step_classes, desc, warning_message in additional_steps:
add_step_if_requested(steps, step_classes, desc, warning_message)
"""
Demonstrates the execution of a pipeline.
The function prompts the user to select a pipeline configuration or create a custom one.
It then sets a limit for the number of data points to be processed, if specified.
Finally, it runs the pipeline with the selected configuration and limit.
Args:
None
Returns:
None
"""
continue_with_custom_config = True
if get_yes_no_input(f"Do you want to list all available pipeline configs? (y/N)\n"):
# Create the formatted string using list comprehension and join
all_pipeline_configs = get_all_available_pipeline_json_configs()
if len(all_pipeline_configs) > 0:
prompt = "Please enter the index of requested pipeline config:\n"
choices = all_pipeline_configs + ["Exit"]
choice = get_multiple_choice(prompt, choices)
if choice != "Exit":
steps = get_pipeline_config_from_json(config_name=choice)
continue_with_custom_config = False
else:
print("Exiting...\n")
else:
print("No pipeline configs found.\n")

if continue_with_custom_config:
print("Continuing with custom pipeline config...\n\n")
steps = []
# get default steps and optional steps attrs
initial_steps_attr = get_pipeline_initial_steps()
additional_steps_attr = get_pipeline_additional_steps()

# create step instances from default steps attrs and add them to steps list
for step_class, desc, warning_message in initial_steps_attr:
steps.append(step_class(force_refresh=True))

# add optional steps to steps list if requested
for step_class, desc, warning_message in additional_steps_attr:
add_step_if_requested(steps, step_class, desc, warning_message)

limit = get_int_input("Set limit for data points to be processed (0=No limit)\n")
limit = limit if limit > 0 else None
@@ -208,3 +200,21 @@ def pipeline_demo():
)

pipeline.run()


def preprocessing_demo():
if get_yes_no_input("Filter out the API-irrelevant data? (y/n)"):
filter_bool = True
else:
filter_bool = False
if get_yes_no_input(
"Run on historical data ? (y/n)\nNote: DATABASE_TYPE should be S3!"
):
historical_bool = True
else:
historical_bool = False
preprocessor = Preprocessing(
filter_null_data=filter_bool, historical_data=historical_bool
)
df = preprocessor.implement_preprocessing_pipeline()
preprocessor.save_preprocessed_data()