Merge pull request #193 from amosproj/dev
sprint-09-release
rbbozkurt authored Jan 10, 2024
2 parents 3a9dfc6 + e0d68f3 commit 59f1a76
Showing 19 changed files with 1,431 additions and 259 deletions.
35 changes: 35 additions & 0 deletions Documentation/Google-Search-Strategy.md
@@ -0,0 +1,35 @@
<!--
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2023 Lucca Baumgärtner <lucca.baumgaertner@fau.de>
-->

# Google Search Strategy

## Introduction

In order to gather more information about a lead, we query the Google Places API. The API has
multiple endpoints, enabling different search methods. To maximise the chances of correctly identifying a lead,
we combine these search methods and derive the most probable result.

### Available Lead Information

| First Name | Last Name | Phone Number | Email |
| ---------- | --------- | ------------- | ---------------------------- |
| Max | Muster | +491721234567 | max-muster@mybusiness.com |
| Melanie | Muster | +491322133321 | melanies-flowershop@gmail.nl |
| ... | ... | ... | ... |

### Available Search Methods

1. Fulltext Search
2. Phone Number Search
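
Both methods map onto the Places API *Find Place* endpoint, which accepts either a free-text query (`inputtype=textquery`) or an E.164 phone number (`inputtype=phonenumber`). The sketch below shows how such lookups could be issued; the environment variable name, the helper function, and the example queries are illustrative and do not reflect the repository's actual `GooglePlaces` step.

```python
import os

import requests

FIND_PLACE_URL = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"


def find_place(query: str, input_type: str) -> list[dict]:
    """Query the Places 'Find Place' endpoint.

    input_type is either 'textquery' (fulltext search) or 'phonenumber'
    (expects the number in E.164 format, e.g. +491721234567).
    """
    params = {
        "input": query,
        "inputtype": input_type,
        "fields": "place_id,name,formatted_address",
        "key": os.environ["GOOGLE_PLACES_API_KEY"],  # illustrative variable name
    }
    response = requests.get(FIND_PLACE_URL, params=params, timeout=10)
    response.raise_for_status()
    return response.json().get("candidates", [])


# Fulltext search, e.g. built from the available lead information
text_candidates = find_place("Melanie Muster flower shop", "textquery")

# Phone number search
phone_candidates = find_place("+491322133321", "phonenumber")
```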

## Search Strategy

1. Phone Number Search
   1. If there is a valid phone number, look it up
2. Email-Based Search
   1. If the email uses a custom domain, look up the domain
   2. Else: unless it contains the lead's full name, look up the email account (everything before the `@` sign)
3. If the email-based search returned any results, use those
4. Else: return the phone-based search results
5. Else: return nothing

![Search Strategy](./Media/google_places_strategy.drawio.png)
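
In addition to the diagram, the decision flow can be sketched in a few lines of Python. The callables passed in stand for the actual Places lookups (for example thin wrappers around the *Find Place* requests shown above), and the free-mail list and name check are simplified assumptions rather than the project's real implementation.

```python
import re
from typing import Callable

# Simplified, assumed list of free-mail providers; the real check may differ.
FREEMAIL_DOMAINS = {"gmail.com", "gmail.nl", "gmx.de", "web.de", "outlook.com"}


def choose_places_results(
    lead: dict,
    search_by_phone: Callable[[str], list[dict]],
    search_by_text: Callable[[str], list[dict]],
) -> list[dict]:
    """Combine phone- and email-based lookups as described above."""
    # 1. Phone number search, if a plausibly valid number is present
    phone_results: list[dict] = []
    phone = lead.get("phone_number", "")
    if re.fullmatch(r"\+\d{7,15}", phone):
        phone_results = search_by_phone(phone)

    # 2. Email-based search
    email_results: list[dict] = []
    local_part, _, domain = lead.get("email", "").partition("@")
    if domain and domain.lower() not in FREEMAIL_DOMAINS:
        # Custom domain, e.g. mybusiness.com
        email_results = search_by_text(domain)
    elif local_part:
        # Free-mail account: only look it up if it is more than just the name,
        # e.g. melanies-flowershop
        full_name = re.sub(
            r"[^a-z]", "", (lead.get("first_name", "") + lead.get("last_name", "")).lower()
        )
        account = re.sub(r"[^a-z]", "", local_part.lower())
        if not full_name or full_name not in account:
            email_results = search_by_text(local_part)

    # 3.-5. Prefer email-based results, fall back to phone-based results, else nothing
    if email_results:
        return email_results
    return phone_results
```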
2 changes: 2 additions & 0 deletions Documentation/Media/google_places_strategy.drawio.png.license
@@ -0,0 +1,2 @@
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2023 Lucca Baumgärtner <lucca.baumgaertner@fau.de>
2 changes: 2 additions & 0 deletions Pipfile
@@ -38,6 +38,8 @@ pyspellchecker = "==0.7.2"
autocorrect = "==2.6.1"
textblob = "==0.17.1"
deep-translator = "==1.11.4"
fsspec = "2023.12.2"
s3fs = "2023.12.2"

[requires]
python_version = "3.10"
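
The new `fsspec`/`s3fs` pins let pandas resolve `s3://` URLs directly, which appears to be what the S3-backed historical-data mode in the preprocessing demo below relies on. A minimal sketch, assuming AWS credentials are configured and using a placeholder bucket:

```python
import pandas as pd

# With s3fs installed, pandas routes s3:// paths through fsspec automatically.
# Bucket and keys are placeholders, not paths used by the project.
df = pd.read_csv("s3://example-bucket/historical/leads.csv")
df.to_csv("s3://example-bucket/preprocessed/leads.csv", index=False)
```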
812 changes: 649 additions & 163 deletions Pipfile.lock

Large diffs are not rendered by default.

29 changes: 18 additions & 11 deletions src/bdc/steps/regionalatlas.py
@@ -106,7 +106,6 @@ def run(self) -> DataFrame:
self.df[f"{self.name.lower()}_regional_score"] = self.df.progress_apply(
lambda lead: pd.Series(self.calculate_regional_score(lead)), axis=1
)

return self.df

def finish(self) -> None:
@@ -230,17 +229,25 @@ def calculate_regional_score(self, lead) -> float | None:
:return: float | None - The computed score if the necessary fields are present for the lead. None otherwise.
"""

if (
lead[f"{self.name.lower()}_pop_density"] is None
or lead[f"{self.name.lower()}_employment_rate"] is None
or lead[f"{self.name.lower()}_disp_income_p_inhabitant"] is None
):
return None
pop_density_col = f"{self.name.lower()}_pop_density"
employment_rate_col = f"{self.name.lower()}_employment_rate"
income_col = f"{self.name.lower()}_disp_income_p_inhabitant"

pop_density = lead[pop_density_col]
employment_rate = lead[employment_rate_col]
income_per_inhabitant = lead[income_col]

pop_density = pop_density if pd.notnull(pop_density) else 0
employment_rate = employment_rate if pd.notnull(employment_rate) else 0
income_per_inhabitant = (
income_per_inhabitant if pd.notnull(income_per_inhabitant) else 0
)

regional_score = (
lead[f"{self.name.lower()}_pop_density"]
* lead[f"{self.name.lower()}_employment_rate"]
* lead[f"{self.name.lower()}_disp_income_p_inhabitant"]
pop_density * employment_rate * income_per_inhabitant
) / 1000000

return regional_score
if pd.notnull(regional_score):
return regional_score
else:
raise ValueError("Regional score is null")
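
For reference, a worked example of the updated scoring behaviour: a missing factor is now treated as `0`, so the score collapses to `0.0` where the old code returned `None`. The column prefix and the values below are made up.

```python
import pandas as pd

# Made-up values; the column prefix depends on the step's name attribute.
lead = pd.Series(
    {
        "regionalatlas_pop_density": 1500.0,
        "regionalatlas_employment_rate": 60.0,
        "regionalatlas_disp_income_p_inhabitant": None,  # missing -> treated as 0
    }
)


def _or_zero(value):
    return value if pd.notnull(value) else 0


pop_density = _or_zero(lead["regionalatlas_pop_density"])
employment_rate = _or_zero(lead["regionalatlas_employment_rate"])
income = _or_zero(lead["regionalatlas_disp_income_p_inhabitant"])

score = (pop_density * employment_rate * income) / 1000000
print(score)  # 0.0 -- the old code would have returned None here
```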
5 changes: 5 additions & 0 deletions src/demo/__init__.py
@@ -0,0 +1,5 @@
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Berkay Bozkurt <resitberkaybozkurt@gmail.com>

from .console_utils import get_int_input, get_multiple_choice, get_yes_no_input
from .demos import bdc_demo, db_demo, evp_demo, pipeline_demo, preprocessing_demo
76 changes: 76 additions & 0 deletions src/demo/console_utils.py
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Berkay Bozkurt <resitberkaybozkurt@gmail.com>


# Utility Functions
def get_yes_no_input(prompt: str) -> bool:
"""
Prompts the user with a given prompt and returns True if the user enters 'yes' or 'y',
and False if the user enters 'no' or 'n'. The input is case-insensitive.
Args:
prompt (str): The prompt to display to the user.
Returns:
bool: True if the user enters 'yes' or 'y', False if the user enters 'no' or 'n'.
"""
while True:
user_input = input(prompt).strip().lower()
if user_input in ["y", "yes"]:
return True
elif user_input in ["n", "no"]:
return False
else:
print("Invalid input. Please enter (yes/no) or (y/N).")


def get_int_input(prompt: str, input_range: range = None) -> int:
"""
Prompts the user for an integer input and validates it.
Args:
prompt (str): The prompt message to display to the user.
input_range (range, optional): The range of valid input values. Defaults to None.
Returns:
int: The validated integer input.
Raises:
ValueError: If the input is not a valid integer.
"""
while True:
try:
input_int = int(input(prompt))
if input_range is not None and input_int not in input_range:
print("Invalid input. Please enter a valid integer.")
continue
else:
return input_int
except ValueError:
print("Invalid input. Please enter a valid integer.")


def get_multiple_choice(prompt: str, choices: list) -> str:
"""
Prompts the user with a message and a list of choices, and returns the selected choice.
Args:
prompt (str): The message to display to the user.
choices (list): The list of choices to display to the user.
Returns:
str: The selected choice.
Raises:
ValueError: If the user enters an invalid input.
"""
while True:
try:
prompt += "".join(
f"({index}) : {choice} \n" for index, choice in enumerate(choices)
)
ind = get_int_input(prompt, range(len(choices)))
return choices[ind]
except ValueError:
print("Invalid input. Please enter a valid integer.")
152 changes: 81 additions & 71 deletions src/demos.py → src/demo/demos.py
@@ -6,28 +6,25 @@
# SPDX-FileCopyrightText: 2023 Ruchita Nathani <Ruchita.nathani@fau.de>
# SPDX-FileCopyrightText: 2023 Ahmed Sheta <ahmed.sheta@fau.de>


from sklearn.metrics import mean_squared_error

from bdc import DataCollector
from bdc.pipeline import Pipeline
from bdc.steps import (
AnalyzeEmails,
FacebookGraphAPI,
GooglePlaces,
GooglePlacesDetailed,
GPTReviewSentimentAnalyzer,
GPTSummarizer,
PreprocessPhonenumbers,
RegionalAtlas,
ScrapeAddress,
SmartReviewInsightsEnhancer,
)
from database import get_database
from database.parsers import LeadParser
from demo.console_utils import get_int_input, get_multiple_choice, get_yes_no_input
from demo.pipeline_utils import (
get_all_available_pipeline_json_configs,
get_pipeline_additional_steps,
get_pipeline_config_from_json,
get_pipeline_initial_steps,
)
from evp import EstimatedValuePredictor
from evp.data_processing import split_dataset
from evp.predictors import Predictors
from logger import get_logger
from preprocessing import Preprocessing

log = get_logger()

@@ -38,31 +35,11 @@
OUTPUT_FILE_BDC = "../data/collected_data.json"


# Utility Functions
def get_yes_no_input(prompt: str) -> bool:
while True:
user_input = input(prompt).strip().lower()
if user_input in ["y", "yes"]:
return True
elif user_input in ["n", "no"]:
return False
else:
print("Invalid input. Please enter (yes/no) or (y/N).")


def get_int_input(prompt: str) -> int:
while True:
try:
return int(input(prompt))
except ValueError:
print("Invalid input. Please enter a valid integer.")


# bdc_demo
def bdc_demo():
dc = DataCollector()
try:
choice = get_int_input("(1) Read CSV\n(2) Dummy API\n")
choice = get_int_input("(1) Read CSV\n(2) Dummy API\n", range(1, 3))
if choice == 1:
dc.get_data_from_csv(file_path=INPUT_FILE_BDC)
elif choice == 2:
@@ -94,7 +71,8 @@ def evp_demo():

while True:
choice = get_int_input(
"(1) Train\n(2) Test\n(3) Predict on single lead\n(4) Save model\n(5) Exit\n"
"(1) Train\n(2) Test\n(3) Predict on single lead\n(4) Save model\n(5) Exit\n",
range(1, 6),
)
if choice == 1:
evp.train(LEADS_TRAIN_FILE)
@@ -121,7 +99,9 @@ def test_evp_model(evp: EstimatedValuePredictor):

def predict_single_lead(evp: EstimatedValuePredictor):
leads = LeadParser.parse_leads_from_csv(LEADS_TEST_FILE)
lead_id = get_int_input(f"Choose a lead_id in range [0, {len(leads) - 1}]\n")
lead_id = get_int_input(
f"Choose a lead_id in range [0, {len(leads) - 1}]\n", range(len(leads))
)
if 0 <= lead_id < len(leads):
prediction = evp.estimate_value(leads[lead_id])
print(
@@ -134,54 +114,66 @@ def predict_single_lead(evp: EstimatedValuePredictor):
# db_demo
def db_demo():
amt_leads = get_database().get_cardinality()
lead_id = get_int_input(f"Choose a lead_id in range [1, {amt_leads}]\n")
lead_id = get_int_input(
f"Choose a lead_id in range [1, {amt_leads}]\n", range(1, amt_leads + 1)
)
if 1 <= lead_id <= amt_leads:
print(get_database().get_lead_by_id(lead_id))
else:
print("Invalid Choice")


def add_step_if_requested(
steps, step_classes, step_desc, step_warning_message: str = ""
):
def add_step_if_requested(steps, step_class, step_desc, step_warning_message: str = ""):
if get_yes_no_input(f"Run {step_desc} {step_warning_message}(y/N)?\n"):
force = get_yes_no_input("Force execution if data is present? (y/N)\n")
for cls in step_classes:
steps.append(cls(force_refresh=force))
steps.append(step_class(force_refresh=force))


# pipeline_demo
def pipeline_demo():
steps = [AnalyzeEmails(force_refresh=True)]
additional_steps = [
([ScrapeAddress], "Scrape Address", "(will take a long time)"),
([FacebookGraphAPI], "Facebook Graph API", "(will use token)"),
([PreprocessPhonenumbers], "Phone Number Validation", ""),
(
[GooglePlaces, GooglePlacesDetailed],
"Google API",
"(will use token and generate cost!)",
),
(
[GPTReviewSentimentAnalyzer, GPTSummarizer],
"open API Sentiment Analyzer & Summarizer",
"(will use token and generate cost!)",
),
(
[SmartReviewInsightsEnhancer],
"Smart Review Insights",
"(will take looong time!)",
),
([RegionalAtlas], "Regionalatlas", ""),
]

if get_yes_no_input(f"Run Demo pipeline with all steps (y/N)?\n"):
for step_classes, _, _ in additional_steps:
step_instances = [cls(force_refresh=True) for cls in step_classes]
steps.extend(step_instances)
else:
for step_classes, desc, warning_message in additional_steps:
add_step_if_requested(steps, step_classes, desc, warning_message)
"""
Demonstrates the execution of a pipeline.
The function prompts the user to select a pipeline configuration or create a custom one.
It then sets a limit for the number of data points to be processed, if specified.
Finally, it runs the pipeline with the selected configuration and limit.
Args:
None
Returns:
None
"""
continue_with_custom_config = True
if get_yes_no_input(f"Do you want to list all available pipeline configs? (y/N)\n"):
# Create the formatted string using list comprehension and join
all_pipeline_configs = get_all_available_pipeline_json_configs()
if len(all_pipeline_configs) > 0:
prompt = "Please enter the index of requested pipeline config:\n"
choices = all_pipeline_configs + ["Exit"]
choice = get_multiple_choice(prompt, choices)
if choice != "Exit":
steps = get_pipeline_config_from_json(config_name=choice)
continue_with_custom_config = False
else:
print("Exiting...\n")
else:
print("No pipeline configs found.\n")

if continue_with_custom_config:
print("Continuing with custom pipeline config...\n\n")
steps = []
# get default steps and optional steps attrs
initial_steps_attr = get_pipeline_initial_steps()
additional_steps_attr = get_pipeline_additional_steps()

# create step instances from default steps attrs and add them to steps list
for step_class, desc, warning_message in initial_steps_attr:
steps.append(step_class(force_refresh=True))

# add optional steps to steps list if requested
for step_class, desc, warning_message in additional_steps_attr:
add_step_if_requested(steps, step_class, desc, warning_message)

limit = get_int_input("Set limit for data points to be processed (0=No limit)\n")
limit = limit if limit > 0 else None
@@ -208,3 +200,21 @@ def pipeline_demo():
)

pipeline.run()


def preprocessing_demo():
if get_yes_no_input("Filter out the API-irrelevant data? (y/n)"):
filter_bool = True
else:
filter_bool = False
if get_yes_no_input(
"Run on historical data ? (y/n)\nNote: DATABASE_TYPE should be S3!"
):
historical_bool = True
else:
historical_bool = False
preprocessor = Preprocessing(
filter_null_data=filter_bool, historical_data=historical_bool
)
df = preprocessor.implement_preprocessing_pipeline()
preprocessor.save_preprocessed_data()