srbhr · Pterjudin · Jan 8, 2025 · Jan 9, 2025 · Feb 16, 2025
diff --git a/README.md b/README.md
@@ -128,7 +128,7 @@ Follow these steps to set up the environment and run the application.
        pyenv install -v 3.11.0
      ```
 
-   - pyenv with virtual enviroment
+   - pyenv with virtual environment
      ```
         pyenv virtualenv 3.11.0 venv
      ```

diff --git a/requirements.txt b/requirements.txt
@@ -26,7 +26,7 @@ htbuilder==0.6.2
 idna==3.7
 importlib-metadata==6.8.0
 jellyfish==1.0.0
-Jinja2==3.1.4
+Jinja2==3.1.5
 joblib==1.3.1
 jsonschema==4.18.3
 jsonschema-specifications==2023.6.1

diff --git a/scripts/parsers/ParseJobDescToJson.py b/scripts/parsers/ParseJobDescToJson.py
@@ -1,6 +1,8 @@
 import json
 import os
 import pathlib
+import re
+from collections import defaultdict
 
 from scripts.Extractor import DataExtractor
 from scripts.KeytermsExtraction import KeytermExtractor
@@ -20,10 +22,38 @@ def __init__(self, job_desc: str):
         self.keyterms = KeytermExtractor(self.clean_data).get_keyterms_based_on_sgrank()
         self.bi_grams = KeytermExtractor(self.clean_data).bi_gramchunker()
         self.tri_grams = KeytermExtractor(self.clean_data).tri_gramchunker()
+        self.experience_sections = self.group_experience_sections()
+
+    def group_experience_sections(self):
+        """
+        Group lines of the cleaned job description under the company that introduces them.
+
+        A line containing "Worked at X", "Experience at X" or "Previous role at X" starts
+        a section for X; it and all following lines are collected under X until the next
+        marker line. Lines before the first marker are dropped.
+
+        Returns:
+            dict mapping each company name to its section lines joined by spaces.
+        """
+        # One compiled alternation: the leftmost marker on a line decides the company.
+        marker = re.compile(r"(?:Worked|Experience|Previous role) at ([A-Za-z\s]+)")
+        experience_dict = defaultdict(list)
+        current_company = None
+
+        for line in self.clean_data.split("\n"):
+            match = marker.search(line)
+            # Skip blank captures so an empty company name never becomes a key.
+            if match and match.group(1).strip():
+                current_company = match.group(1).strip()
+            if current_company:
+                # defaultdict appends to an existing section instead of resetting it.
+                experience_dict[current_company].append(line)
+
+        return {company: " ".join(details) for company, details in experience_dict.items()}
 
     def get_JSON(self) -> dict:
         """
-        Returns a dictionary of job description data.
+        Returns a dictionary of job description data with grouped experience sections.
         """
         job_desc_dictionary = {
             "unique_id": generate_unique_id(),
@@ -35,6 +65,7 @@ def get_JSON(self) -> dict:
             "bi_grams": str(self.bi_grams),
             "tri_grams": str(self.tri_grams),
             "pos_frequencies": self.pos_frequencies,
+            "experience_sections": self.experience_sections,
         }
 
-        return job_desc_dictionary
+        return job_desc_dictionary
diff --git a/scripts/parsers/ParseResumeToJson.py b/scripts/parsers/ParseResumeToJson.py
@@ -2,6 +2,8 @@
 import os
 import os.path
 import pathlib
+import re
+from collections import defaultdict
 
 from scripts.Extractor import DataExtractor
 from scripts.KeytermsExtraction import KeytermExtractor
@@ -17,7 +19,7 @@ def __init__(self, resume: str):
         self.clean_data = TextCleaner.clean_text(self.resume_data)
         self.entities = DataExtractor(self.clean_data).extract_entities()
         self.name = DataExtractor(self.clean_data[:30]).extract_names()
-        self.experience = DataExtractor(self.clean_data).extract_experience()
+        self.experience = self.extract_experience_sections()
         self.emails = DataExtractor(self.resume_data).extract_emails()
         self.phones = DataExtractor(self.resume_data).extract_phone_numbers()
         self.years = DataExtractor(self.clean_data).extract_position_year()
@@ -27,9 +29,50 @@ def __init__(self, resume: str):
         self.bi_grams = KeytermExtractor(self.clean_data).bi_gramchunker()
         self.tri_grams = KeytermExtractor(self.clean_data).tri_gramchunker()
 
+    def extract_experience_sections(self):
+        """
+        Extract and group job experience sections from the resume.
+
+        Each line is tried against the header patterns in order and the first match
+        wins, so the detailed "role at company (years)" forms beat the bare "Worked at"
+        fallback. The header line and the lines after it become that company's details
+        until the next header; a repeated company keeps its first role/years and details.
+
+        Returns:
+            dict mapping company name to {"role", "years", "details"}, with details
+            being the collected lines joined by spaces.
+        """
+        experience_patterns = [
+            r"(?P<role>.*) at (?P<company>[A-Za-z\s]+) \((?P<years>[0-9]{4}-[0-9]{4})\)",
+            r"(?P<role>.*) - (?P<company>[A-Za-z\s]+), (?P<years>[0-9]{4}-[0-9]{4})",
+            r"Worked at (?P<company>[A-Za-z\s]+)",
+        ]
+        experience_dict = {}
+        current_company = None
+
+        for line in self.clean_data.split("\n"):
+            matches = (re.search(pattern, line) for pattern in experience_patterns)
+            match = next((m for m in matches if m), None)
+            # Ignore blank company captures; placeholders cover patterns lacking role/years.
+            if match and match.group("company").strip():
+                fields = match.groupdict()
+                current_company = fields["company"].strip()
+                experience_dict.setdefault(current_company, {
+                    "role": fields.get("role", "Unknown Role").strip(),
+                    "years": fields.get("years", "N/A").strip(),
+                    "details": [],
+                })
+            if current_company:
+                experience_dict[current_company]["details"].append(line)
+
+        return {
+            company: {**details, "details": " ".join(details["details"])}
+            for company, details in experience_dict.items()
+        }
+
     def get_JSON(self) -> dict:
         """
-        Returns a dictionary of resume data.
+        Returns a dictionary of resume data with improved experience parsing.
         """
         resume_dictionary = {
             "unique_id": generate_unique_id(),
@@ -48,4 +91,4 @@ def get_JSON(self) -> dict:
             "pos_frequencies": self.pos_frequencies,
         }
 
-        return resume_dictionary
+        return resume_dictionary