update CA question import

struan · struan · commit 43050182eb3e · 2024-05-21T18:02:44.000+01:00
diff --git a/crowdsourcer/management/commands/import_combined_authority_questions.py b/crowdsourcer/management/commands/import_combined_authority_questions.py
@@ -5,14 +5,12 @@
 
 import pandas as pd
 
-from crowdsourcer.models import Option, Question, QuestionGroup, Section
+from crowdsourcer.models import MarkingSession, Option, Question, QuestionGroup, Section
 
 
 class Command(BaseCommand):
     help = "import questions"
 
-    question_file = settings.BASE_DIR / "data" / "combined_authority_questions.xlsx"
-
     column_names = [
         "question_no",
         "topic",
@@ -29,34 +27,129 @@ class Command(BaseCommand):
 
     # get round limits on length of sheet names
     sheet_map = {
-        "Buildings & Heating & Green Skills": "Buildings & Heating & Green Ski",
+        "Buildings & Heating & Green Skills (CA)": "B&H CA",
+        "Collaboration & Engagement (CA)": "C&E CA",
+        "Governance & Finance (CA)": "G&F CA",
     }
 
     def add_arguments(self, parser):
         parser.add_argument(
             "-q", "--quiet", action="store_true", help="Silence progress bars."
         )
 
+        parser.add_argument(
+            "--file", action="store", help="Excel file containing the questions"
+        )
+
+        parser.add_argument(
+            "--session", action="store", help="Marking session to use questions with"
+        )
+
         parser.add_argument(
             "--text_only",
             action="store_true",
             help="Only update question text, criteria and clarifications",
         )
+        parser.add_argument(
+            "--column_list", action="store", help="file with list of column names"
+        )
+
+    def get_column_names(self, **kwargs):
+        """Load self.column_names from the "Column" field of a CSV under data/.
+
+        Keeps the standard columns if --column_list is unset or the file is missing.
+        """
+        column_list = kwargs.get("column_list", None)
+        if column_list is None:
+            return
+        column_list = settings.BASE_DIR / "data" / column_list
+        if not column_list.exists():
+            self.stderr.write(
+                f"file does not exist: {column_list}, using standard columns"
+            )
+            return
+        self.column_names = pd.read_csv(column_list)["Column"].tolist()
 
     def handle(self, quiet: bool = False, *args, **kwargs):
-        group = QuestionGroup.objects.get(description="Combined Authority")
+        file = kwargs.get("file", None)
+
+        if file is None:
+            self.stderr.write("please supply a file name")
+            return
+
+        self.question_file = settings.BASE_DIR / "data" / file
 
-        for section in Section.objects.filter(title__contains="(CA)"):
-            title = section.title.replace(" (CA)", "")
+        session_label = kwargs.get("session", None)
+        try:
+            session = MarkingSession.objects.get(label=session_label)
+        except MarkingSession.DoesNotExist:
+            self.stderr.write(f"No session with that name: {session_label}")
+            return
+
+        group = QuestionGroup.objects.get(
+            description="Combined Authority", marking_session=session
+        )
+
+        self.get_column_names(**kwargs)
+
+        for section in Section.objects.filter(
+            marking_session=session, title__contains="(CA)"
+        ):
+            header = 2
+            sheet_name = self.sheet_map.get(section.title, section.title)
+            print(sheet_name)
             df = pd.read_excel(
                 self.question_file,
-                sheet_name=self.sheet_map.get(title, title),
-                header=2,
-                # remove blank and hidden notes columns
+                sheet_name=sheet_name,
+            )
+
+            if "Question" in df.columns:
+                header = 0
+            else:
+                found_header = False
+                for index, row in df.iterrows():
+                    for i in [2, 3]:
+                        q_cell = row.iat[i]
+                        if type(q_cell) == str and q_cell.strip() == "Question":
+                            header = index + 1
+                            found_header = True
+                            break
+
+                    if found_header:
+                        break
+
+                    if index > 5:
+                        print(f"Did not find header in {section}")
+                        break
+
+            df = pd.read_excel(
+                self.question_file,
+                sheet_name=sheet_name,
+                header=header,
                 usecols=lambda name: name != "Notes" and "Unnamed" not in name,
             )
 
             df = df.dropna(axis="index", how="all")
+            drop_cols = [
+                "Climate Justice/Adaptation Tag",
+                "Drop down box options for no mark awarded (internal)",
+                "Is this question or criteria changing?",
+                "Change proposed",
+                "New Criteria",
+                "Clarifications",
+                "2023 Scorecards Criteria",
+                "2023 Scorecards Clarifications",
+                "2023 Criteria",
+                "2023 Clarifications",
+                "Previous Criteria from 2023 Scorecards",
+                "Type",
+                "Edits",
+                "Total Points Available when weighted",
+                "Weighting",
+            ]
+            for col in drop_cols:
+                if col in df.columns:
+                    df = df.drop(col, axis=1)
 
             columns = list(self.column_names)
             options = len(df.columns) - len(self.column_names) + 1
@@ -69,6 +162,9 @@ def handle(self, quiet: bool = False, *args, **kwargs):
                 if pd.isna(row["question_no"]):
                     continue
 
+                if pd.isna(row["question"]):
+                    continue
+
                 q_no = str(row["question_no"])
                 q_part = None
                 if pd.isna(q_no):
@@ -86,7 +182,10 @@ def handle(self, quiet: bool = False, *args, **kwargs):
                     if row["how_marked"] == "FOI":
                         how_marked = "foi"
                         question_type = "foi"
-                    elif "National Data" in row["how_marked"]:
+                    elif (
+                        "National Data" in row["how_marked"]
+                        or "National data" in row["how_marked"]
+                    ):
                         how_marked = "national_data"
                         question_type = "national_data"
 
@@ -105,7 +204,7 @@ def handle(self, quiet: bool = False, *args, **kwargs):
                             pass
                         else:
                             print(
-                                f"missing question type: {title}, {row['question_no']} - {row['question_type']}"
+                                f"missing question type: {section.title}, {row['question_no']} - {row['question_type']}"
                             )
                             continue