refactor: [FC-0063] Block type processors are integrated into the script workflow
raccoongang · Jan 15, 2025 · a4e158e · a4e158e
1 parent 2ab2231
commit a4e158e
Show file tree

Hide file tree

Showing 10 changed files with 111 additions and 1,184 deletions.
diff --git a/setup.py b/setup.py
@@ -25,7 +25,7 @@
         "Programming Language :: Python :: 3.8",
         "Topic :: Utilities",
     ],
-    description=("Command line tool, that converts Common Cartridge " "courses to Open edX Studio imports."),
+    description="Command line tool, that converts Common Cartridge courses to Open edX Studio imports.",
     entry_points={"console_scripts": ["cc2olx=cc2olx.main:main"]},
     install_requires=load_requirements("requirements/base.in"),
     license="GNU Affero General Public License",

diff --git a/src/cc2olx/constants.py b/src/cc2olx/constants.py
@@ -2,7 +2,7 @@
 OLX_STATIC_PATH_TEMPLATE = f"/{OLX_STATIC_DIR}/{{static_filename}}"
 WEB_RESOURCES_DIR_NAME = "web_resources"
 
-LINK_HTML = "<a href='{url}'>{text}</a>"
+LINK_HTML = '<a href="{url}">{text}</a>'
 YOUTUBE_LINK_PATTERN = r"youtube.com/watch\?v=(?P<video_id>[-\w]+)"
 CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"
 

diff --git a/src/cc2olx/django_settings.py b/src/cc2olx/django_settings.py
diff --git a/src/cc2olx/filesystem.py b/src/cc2olx/filesystem.py
@@ -3,9 +3,9 @@
 import zipfile
 
 from xml.etree import ElementTree
-from lxml import etree
 
 from cc2olx.utils import clean_file_name
+from cc2olx.xml.cc_xml import CommonCartridgeXmlParser
 
 logger = logging.getLogger()
 
@@ -32,7 +32,7 @@ def get_xml_tree(path_src):
         # We are using this parser with recover and encoding options so that we are
         # able to parse malformed xml without much issue. The xml that we are
         # anticipating can even be having certain non-acceptable characters like &nbsp.
-        parser = etree.XMLParser(encoding="utf-8", recover=True, ns_clean=True)
+        parser = CommonCartridgeXmlParser(encoding="utf-8", recover=True, ns_clean=True)
         tree = ElementTree.parse(str(path_src), parser=parser)
         return tree
     except ElementTree.ParseError:

diff --git a/src/cc2olx/main.py b/src/cc2olx/main.py
@@ -6,13 +6,13 @@
 from pathlib import Path
 
 import django
+from django.conf import settings
 
-from cc2olx import filesystem
-from cc2olx import olx
+from cc2olx import filesystem, olx
 from cc2olx.cli import parse_args, RESULT_TYPE_FOLDER, RESULT_TYPE_ZIP
 from cc2olx.constants import OLX_STATIC_DIR
 from cc2olx.models import Cartridge
-from cc2olx.settings import collect_settings
+from cc2olx.parser import parse_options
 
 
 def convert_one_file(
@@ -58,23 +58,22 @@ def convert_one_file(
 def main():
     initialize_django()
 
-    parsed_args = parse_args()
-    settings = collect_settings(parsed_args)
+    args = parse_args()
+    options = parse_options(args)
 
-    workspace = settings["workspace"]
-    link_file = settings["link_file"]
-    passport_file = settings["passport_file"]
-    relative_links_source = settings["relative_links_source"]
+    workspace = options["workspace"]
+    link_file = options["link_file"]
+    passport_file = options["passport_file"]
+    relative_links_source = options["relative_links_source"]
 
     # setup logger
-    logging_config = settings["logging_config"]
-    logging.basicConfig(level=logging_config["level"], format=logging_config["format"])
+    logging.basicConfig(level=options["log_level"], format=settings.LOG_FORMAT)
     logger = logging.getLogger()
 
     with tempfile.TemporaryDirectory() as tmpdirname:
         temp_workspace = Path(tmpdirname) / workspace.stem
 
-        for input_file in settings["input_files"]:
+        for input_file in options["input_files"]:
             try:
                 convert_one_file(
                     input_file,
@@ -83,15 +82,14 @@ def main():
                     passport_file,
                     relative_links_source,
                 )
-
             except Exception:
                 logger.exception("Error while converting %s file", input_file)
 
-        if settings["output_format"] == RESULT_TYPE_FOLDER:
+        if options["output_format"] == RESULT_TYPE_FOLDER:
             shutil.rmtree(str(workspace), ignore_errors=True)
             shutil.copytree(str(temp_workspace), str(workspace))
 
-        if settings["output_format"] == RESULT_TYPE_ZIP:
+        if options["output_format"] == RESULT_TYPE_ZIP:
             shutil.make_archive(str(workspace), "zip", str(temp_workspace))
 
     logger.info("Conversion completed")
@@ -103,7 +101,7 @@ def initialize_django():
     """
     Initialize the Django package.
     """
-    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cc2olx.django_settings")
+    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cc2olx.settings")
     django.setup()
 
 

diff --git a/src/cc2olx/models.py b/src/cc2olx/models.py
@@ -1,19 +1,16 @@
-import imghdr
 import logging
 import os.path
 import re
-from textwrap import dedent
 import zipfile
+from pathlib import Path
+from textwrap import dedent
+from typing import Optional
 
 from cc2olx import filesystem
-from cc2olx.constants import OLX_STATIC_PATH_TEMPLATE
 from cc2olx.dataclasses import OlxToOriginalStaticFilePaths
 from cc2olx.external.canvas import ModuleMeta
-from cc2olx.qti import QtiParser
 from cc2olx.utils import clean_file_name
 
-from .utils import simple_slug
-
 logger = logging.getLogger()
 
 MANIFEST = "imsmanifest.xml"
@@ -295,7 +292,7 @@ def flatten(self, container):
                 output.extend(leaves)
         return output
 
-    def define_resource(self, idref):
+    def define_resource(self, idref: Optional[str]) -> dict:
         """
         Define a resource by its identifier.
         """
@@ -305,104 +302,6 @@ def define_resource(self, idref):
             resource = self.resources_by_id.get(module_item_idref)
         return resource
 
-    def get_resource_content(self, identifier):
-        """
-        Get the resource named by `identifier`.
-
-        If the resource can be retrieved, returns a tuple: the first element
-        indicates the type of content, either "html" or "link".  The second
-        element is a dict with details, which vary by the type.
-
-        If the resource can't be retrieved, returns a tuple of None, None.
-
-        """
-        res = self.resources_by_id.get(identifier)
-        if res is None and self.is_canvas_flavor:
-            res = self.resources_by_id.get(self.module_meta.get_identifierref(identifier))
-        if res is None:
-            logger.info("Missing resource: %s", identifier)
-            return None, None
-
-        res_type = res["type"]
-
-        if res_type == "webcontent":
-            res_relative_path = res["children"][0].href
-            res_filename = self._res_filename(res_relative_path)
-            if res_filename.suffix == ".html":
-                try:
-                    with open(str(res_filename), encoding="utf-8") as res_file:
-                        html = res_file.read()
-                except:  # noqa: E722
-                    logger.error("Failure reading %s from id %s", res_filename, identifier)  # noqa: E722
-                    raise
-                return "html", {"html": html}
-            elif "web_resources" in str(res_filename) and imghdr.what(str(res_filename)):
-                static_filename = str(res_filename).split("web_resources/")[1]
-                olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_filename)
-                self.olx_to_original_static_file_paths.web_resources[olx_static_path] = static_filename
-                html = (
-                    '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
-                    '</head><body><p><img src="{}" alt="{}"></p></body></html>'.format(olx_static_path, static_filename)
-                )
-                return "html", {"html": html}
-            elif "web_resources" not in str(res_filename):
-                olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=res_relative_path)
-                # This webcontent is outside of ``web_resources`` directory
-                # So we need to manually copy it to OLX_STATIC_DIR
-                self.olx_to_original_static_file_paths.extra[olx_static_path] = res_relative_path
-                html = (
-                    '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
-                    '</head><body><p><a href="{}" alt="{}">{}<a></p></body></html>'.format(
-                        olx_static_path, res_relative_path, res_relative_path
-                    )
-                )
-                return "html", {"html": html}
-            else:
-                logger.info("Skipping webcontent: %s", res_filename)
-                return None, None
-
-        # Match any of imswl_xmlv1p1, imswl_xmlv1p2 etc
-        elif re.match(r"^imswl_xmlv\d+p\d+$", res_type):
-            tree = filesystem.get_xml_tree(self._res_filename(res["children"][0].href))
-            root = tree.getroot()
-            namespaces = {
-                "imswl_xmlv1p1": "http://www.imsglobal.org/xsd/imsccv1p1/imswl_v1p1",
-                "imswl_xmlv1p2": "http://www.imsglobal.org/xsd/imsccv1p2/imswl_v1p2",
-                "imswl_xmlv1p3": "http://www.imsglobal.org/xsd/imsccv1p3/imswl_v1p3",
-            }
-            ns = {"wl": namespaces[res_type]}
-            title = root.find("wl:title", ns).text
-            url = root.find("wl:url", ns).get("href")
-            return "link", {"href": url, "text": title}
-
-        # Match any of imsbasiclti_xmlv1p0, imsbasiclti_xmlv1p3 etc
-        elif re.match(r"^imsbasiclti_xmlv\d+p\d+$", res_type):
-            data = self._parse_lti(res)
-            # Canvas flavored courses have correct url in module meta for lti links
-            if self.is_canvas_flavor:
-                item_data = self.module_meta.get_external_tool_item_data(identifier)
-                if item_data:
-                    data["launch_url"] = item_data.get("url", data["launch_url"])
-            return "lti", data
-
-        # Match any of imsqti_xmlv1p2/imscc_xmlv1p1/assessment, imsqti_xmlv1p3/imscc_xmlv1p3/assessment etc
-        elif re.match(r"^imsqti_xmlv\d+p\d+/imscc_xmlv\d+p\d+/assessment$", res_type):
-            res_filename = self._res_filename(res["children"][0].href)
-            qti_parser = QtiParser(res_filename)
-            return "qti", qti_parser.parse_qti()
-
-        # Match any of imsdt_xmlv1p1, imsdt_xmlv1p2, imsdt_xmlv1p3 etc
-        elif re.match(r"^imsdt_xmlv\d+p\d+$", res_type):
-            data = self._parse_discussion(res, res_type)
-            return "discussion", data
-
-        else:
-            text = f"Unimported content: type = {res_type!r}"
-            if "href" in res:
-                text += ", href = {!r}".format(res["href"])
-            logger.info("%s", text)
-            return "html", {"html": text}
-
     def load_manifest_extracted(self):
         manifest = self._extract()
 
@@ -718,83 +617,3 @@ def _parse_dependency(self, node):
     def _parse_resource_metadata(self, node):
         # TODO: this
         return None
-
-    def _res_filename(self, file_name):
-        return self.directory / file_name
-
-    def _parse_lti(self, resource):
-        """
-        Parses LTI resource.
-        """
-
-        tree = filesystem.get_xml_tree(self._res_filename(resource["children"][0].href))
-        root = tree.getroot()
-        ns = {
-            "blti": "http://www.imsglobal.org/xsd/imsbasiclti_v1p0",
-            "lticp": "http://www.imsglobal.org/xsd/imslticp_v1p0",
-            "lticm": "http://www.imsglobal.org/xsd/imslticm_v1p0",
-        }
-        title = root.find("blti:title", ns).text
-        description = root.find("blti:description", ns).text
-        launch_url = root.find("blti:secure_launch_url", ns)
-        if launch_url is None:
-            launch_url = root.find("blti:launch_url", ns)
-        if launch_url is not None:
-            launch_url = launch_url.text
-        else:
-            launch_url = ""
-        width = root.find("blti:extensions/lticm:property[@name='selection_width']", ns)
-        if width is None:
-            width = "500"
-        else:
-            width = width.text
-        height = root.find("blti:extensions/lticm:property[@name='selection_height']", ns)
-        if height is None:
-            height = "500"
-        else:
-            height = height.text
-        custom = root.find("blti:custom", ns)
-        if custom is None:
-            parameters = dict()
-        else:
-            parameters = {option.get("name"): option.text for option in custom}
-        # For Canvas flavored CC, tool_id can be used as lti_id if present
-        tool_id = root.find("blti:extensions/lticm:property[@name='tool_id']", ns)
-        if tool_id is None:
-            # Create a simple slug lti_id from title
-            lti_id = simple_slug(title)
-        else:
-            lti_id = tool_id.text
-        data = {
-            "title": title,
-            "description": description,
-            "launch_url": launch_url,
-            "height": height,
-            "width": width,
-            "custom_parameters": parameters,
-            "lti_id": lti_id,
-        }
-        return data
-
-    def _parse_discussion(self, res, res_type):
-        """
-        Parses discussion content.
-        """
-
-        namespaces = {
-            "imsdt_xmlv1p1": "http://www.imsglobal.org/xsd/imsccv1p1/imsdt_v1p1",
-            "imsdt_xmlv1p2": "http://www.imsglobal.org/xsd/imsccv1p2/imsdt_v1p2",
-            "imsdt_xmlv1p3": "http://www.imsglobal.org/xsd/imsccv1p3/imsdt_v1p3",
-        }
-
-        data = {"dependencies": []}
-        for child in res["children"]:
-            if isinstance(child, ResourceFile):
-                tree = filesystem.get_xml_tree(self._res_filename(child.href))
-                root = tree.getroot()
-                ns = {"dt": namespaces[res_type]}
-                data["title"] = root.find("dt:title", ns).text
-                data["text"] = root.find("dt:text", ns).text
-            elif isinstance(child, ResourceDependency):
-                data["dependencies"].append(self.get_resource_content(child.identifierref))
-        return data