feat: Block type processors are implemented

raccoongang · Jan 7, 2025 · e214c7d · e214c7d
1 parent 599fb96
commit e214c7d
Show file tree

Hide file tree

Showing 23 changed files with 1,576 additions and 0 deletions.
diff --git a/src/cc2olx/constants.py b/src/cc2olx/constants.py
@@ -0,0 +1,11 @@
+# Name of the static files directory; also the first segment of OLX static paths.
+OLX_STATIC_DIR = "static"
+# Template of a static file path; ``static_filename`` is filled in later via ``str.format``.
+OLX_STATIC_PATH_TEMPLATE = f"/{OLX_STATIC_DIR}/{{static_filename}}"
+# Name of the directory the parsers look for in resource file paths.
+WEB_RESOURCES_DIR_NAME = "web_resources"
+
+# Template of the Web Link XML namespace; filled with the major and minor version numbers.
+WEB_LINK_NAMESPACE = (
+    "http://www.imsglobal.org/xsd/imsccv{major_version}p{minor_version}/imswl_v{major_version}p{minor_version}"
+)
+# The dot is escaped so that only a literal "youtube.com" matches, not any character in its place.
+YOUTUBE_LINK_PATTERN = r"youtube\.com/watch\?v=(?P<video_id>[-\w]+)"
+# Template of an HTML anchor; the values are inserted as-is, no escaping is applied by the template.
+LINK_HTML = "<a href='{url}'>{text}</a>"
+
+# NOTE(review): presumably the QTI response processing feedback identifiers - confirm against the QTI parser.
+QTI_RESPROCESSING_TYPES = ["general_fb", "correct_fb", "general_incorrect_fb"]
diff --git a/src/cc2olx/content_parsers/__init__.py b/src/cc2olx/content_parsers/__init__.py
@@ -0,0 +1,6 @@
+from cc2olx.content_parsers.abc import AbstractContentParser
+from cc2olx.content_parsers.discussion import DiscussionContentParser
+from cc2olx.content_parsers.html import HtmlContentParser
+from cc2olx.content_parsers.lti import LtiContentParser
+from cc2olx.content_parsers.qti import QtiContentParser
+from cc2olx.content_parsers.video import VideoContentParser
diff --git a/src/cc2olx/content_parsers/abc.py b/src/cc2olx/content_parsers/abc.py
@@ -0,0 +1,29 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Union
+
+from cc2olx.content_parsers.utils import StaticLinkProcessor
+from cc2olx.models import Cartridge
+
+
+class AbstractContentParser(ABC):
+    """
+    Base class of the Common Cartridge resource content parsers.
+
+    Subclasses implement ``_parse_content``; ``parse`` additionally passes the
+    parsed content through ``StaticLinkProcessor``.
+    """
+
+    def __init__(self, cartridge: Cartridge) -> None:
+        self._cartridge = cartridge
+
+    def parse(self, idref: Optional[str]) -> Optional[Union[list, dict]]:
+        """
+        Parse the resource with the specified identifier.
+
+        A falsy parsing result is returned unchanged; any other result is
+        returned after its static links are processed.
+        """
+        parsed_content = self._parse_content(idref)
+        if not parsed_content:
+            return parsed_content
+
+        static_link_processor = StaticLinkProcessor(self._cartridge)
+        return static_link_processor.process_content_static_links(parsed_content)
+
+    @abstractmethod
+    def _parse_content(self, idref: Optional[str]) -> Optional[Union[list, dict]]:
+        """
+        Provide the content of the resource with the specified identifier.
+        """
diff --git a/src/cc2olx/content_parsers/discussion.py b/src/cc2olx/content_parsers/discussion.py
@@ -0,0 +1,51 @@
+import re
+from typing import Dict, Optional
+
+from cc2olx import filesystem
+from cc2olx.content_parsers import AbstractContentParser
+from cc2olx.enums import CommonCartridgeResourceType
+from cc2olx.models import ResourceFile
+
+
+class DiscussionContentParser(AbstractContentParser):
+    """
+    Parser of the discussion topic resources content.
+    """
+
+    NAMESPACES = {
+        "imsdt_xmlv1p1": "http://www.imsglobal.org/xsd/imsccv1p1/imsdt_v1p1",
+        "imsdt_xmlv1p2": "http://www.imsglobal.org/xsd/imsccv1p2/imsdt_v1p2",
+        "imsdt_xmlv1p3": "http://www.imsglobal.org/xsd/imsccv1p3/imsdt_v1p3",
+    }
+
+    def _parse_content(self, idref: Optional[str]) -> Optional[Dict[str, str]]:
+        # Nothing to parse without a resource identifier.
+        if not idref:
+            return None
+
+        resource = self._cartridge.define_resource(idref)
+        if not resource:
+            return None
+
+        # Only resources of the discussion topic type are handled by this parser.
+        if not re.match(CommonCartridgeResourceType.DISCUSSION_TOPIC, resource["type"]):
+            return None
+
+        return self._parse_discussion(resource)
+
+    def _parse_discussion(self, resource: dict) -> Dict[str, str]:
+        """
+        Collect the discussion data from the resource files.
+
+        Children that are not ``ResourceFile`` instances are skipped; data of a
+        later file overrides the same keys of an earlier one.
+        """
+        discussion_data = {}
+        resource_files = (child for child in resource["children"] if isinstance(child, ResourceFile))
+
+        for resource_file in resource_files:
+            discussion_data.update(self._parse_resource_file_data(resource_file, resource["type"]))
+
+        return discussion_data
+
+    def _parse_resource_file_data(self, resource_file: ResourceFile, resource_type: str) -> Dict[str, str]:
+        """
+        Read the title and the text from the discussion resource file.
+        """
+        resource_file_path = self._cartridge.build_res_file_path(resource_file.href)
+        root = filesystem.get_xml_tree(resource_file_path).getroot()
+        # The XML namespace is selected by the resource type.
+        namespaces = {"dt": self.NAMESPACES[resource_type]}
+        return {
+            "title": root.find("dt:title", namespaces).text,
+            "text": root.find("dt:text", namespaces).text,
+        }
diff --git a/src/cc2olx/content_parsers/html.py b/src/cc2olx/content_parsers/html.py
@@ -0,0 +1,133 @@
+import imghdr
+import logging
+import re
+from pathlib import Path
+from typing import Dict, Optional
+
+from cc2olx import settings
+from cc2olx.constants import LINK_HTML, OLX_STATIC_PATH_TEMPLATE, WEB_RESOURCES_DIR_NAME
+from cc2olx.content_parsers import AbstractContentParser
+from cc2olx.content_parsers.mixins import WebLinkParserMixin
+from cc2olx.enums import CommonCartridgeResourceType
+
+# Module-level logger named after the module, so that records show where they come from.
+logger = logging.getLogger(__name__)
+
+# Suffix of the file names treated as HTML files.
+HTML_FILENAME_SUFFIX = ".html"
+
+
+class HtmlContentParser(WebLinkParserMixin, AbstractContentParser):
+    """
+    HTML resource content parser.
+
+    Produces a dictionary with the single "html" key for every resource,
+    falling back to ``DEFAULT_CONTENT`` when there is nothing to render.
+    """
+
+    DEFAULT_CONTENT = {"html": "<p>MISSING CONTENT</p>"}
+
+    def _parse_content(self, idref: Optional[str]) -> Dict[str, str]:
+        if idref:
+            if (resource := self._cartridge.define_resource(idref)) is None:
+                logger.info("Missing resource: %s", idref)
+                return self.DEFAULT_CONTENT
+
+            if resource["type"] == CommonCartridgeResourceType.WEB_CONTENT:
+                content = self._parse_webcontent(idref, resource)
+            elif web_link_content := self._parse_web_link_content(resource):
+                content = self._transform_web_link_content_to_html(web_link_content)
+            elif any(
+                re.match(resource_type, resource["type"])
+                for resource_type in (
+                    CommonCartridgeResourceType.LTI_LINK,
+                    CommonCartridgeResourceType.QTI_ASSESSMENT,
+                    CommonCartridgeResourceType.DISCUSSION_TOPIC,
+                )
+            ):
+                # LTI, QTI and discussion resources are not rendered as HTML here.
+                content = self.DEFAULT_CONTENT
+            else:
+                content = self._parse_not_imported_content(resource)
+            return content
+        return self.DEFAULT_CONTENT
+
+    def _parse_webcontent(self, idref: str, resource: dict) -> Dict[str, str]:
+        """
+        Parse the resource with "webcontent" type.
+
+        Only the first child of the resource is inspected. HTML files are read
+        as-is, images from the "web_resources" directory and files outside of
+        it are rendered from templates, anything else gets the default content.
+        """
+        res_relative_path = resource["children"][0].href
+        res_file_path = self._cartridge.build_res_file_path(res_relative_path)
+
+        if res_file_path.suffix == HTML_FILENAME_SUFFIX:
+            content = self._parse_webcontent_html_file(idref, res_file_path)
+        elif WEB_RESOURCES_DIR_NAME in str(res_file_path) and imghdr.what(str(res_file_path)):
+            content = self._parse_image_webcontent_from_web_resources_dir(res_file_path)
+        elif WEB_RESOURCES_DIR_NAME not in str(res_file_path):
+            content = self._parse_webcontent_outside_web_resources_dir(res_relative_path)
+        else:
+            logger.info("Skipping webcontent: %s", res_file_path)
+            content = self.DEFAULT_CONTENT
+
+        return content
+
+    @staticmethod
+    def _parse_webcontent_html_file(idref: str, res_file_path: Path) -> Dict[str, str]:
+        """
+        Parse webcontent HTML file.
+
+        Any error raised while reading the file is logged and re-raised.
+        """
+        try:
+            with open(res_file_path, encoding="utf-8") as res_file:
+                html = res_file.read()
+        except Exception:
+            # Interpreter-level signals (KeyboardInterrupt, SystemExit) are not
+            # reading failures, so they are intentionally not reported here.
+            logger.error("Failure reading %s from id %s", res_file_path, idref)
+            raise
+        return {"html": html}
+
+    @staticmethod
+    def _parse_image_webcontent_from_web_resources_dir(res_file_path: Path) -> Dict[str, str]:
+        """
+        Parse webcontent image from "web_resources" directory.
+
+        The path part after the first "web_resources/" occurrence becomes the
+        static file name substituted into the image template.
+        """
+        # ``as_posix`` provides "/" separators on every platform, and the split limit
+        # keeps the whole remainder even when the directory name occurs again deeper.
+        static_filename = res_file_path.as_posix().split(f"{WEB_RESOURCES_DIR_NAME}/", 1)[1]
+        olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_filename)
+        image_webcontent_tpl_path = settings.TEMPLATES_DIR / "image_webcontent.html"
+
+        with open(image_webcontent_tpl_path, encoding="utf-8") as image_webcontent_tpl:
+            tpl_content = image_webcontent_tpl.read()
+            html = tpl_content.format(olx_static_path=olx_static_path, static_filename=static_filename)
+
+        return {"html": html}
+
+    def _parse_webcontent_outside_web_resources_dir(self, res_relative_path: str) -> Dict[str, str]:
+        """
+        Parse webcontent located outside "web_resources" directory.
+
+        The file is registered in the cartridge as an extra static file and
+        rendered from the external webcontent template.
+        """
+        # This webcontent is outside ``web_resources`` directory
+        # So we need to manually copy it to OLX_STATIC_DIR
+        self._cartridge.add_extra_static_file(res_relative_path)
+        olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=res_relative_path)
+        external_webcontent_tpl_path = settings.TEMPLATES_DIR / "external_webcontent.html"
+
+        with open(external_webcontent_tpl_path, encoding="utf-8") as external_webcontent_tpl:
+            tpl_content = external_webcontent_tpl.read()
+            html = tpl_content.format(olx_static_path=olx_static_path, res_relative_path=res_relative_path)
+
+        return {"html": html}
+
+    @staticmethod
+    def _transform_web_link_content_to_html(web_link_content: Dict[str, str]) -> Dict[str, str]:
+        """
+        Generate HTML for weblink.
+
+        The "href" key is required, a missing "text" key results in an empty link text.
+        """
+        # NOTE(review): the values are inserted without HTML escaping - confirm they are trusted.
+        web_link_html = LINK_HTML.format(url=web_link_content["href"], text=web_link_content.get("text", ""))
+        return {"html": web_link_html}
+
+    @staticmethod
+    def _parse_not_imported_content(resource: dict) -> Dict[str, str]:
+        """
+        Parse the resource which content type cannot be processed.
+
+        The resulting text names the resource type and, when present, its href;
+        the same text is written to the log.
+        """
+        resource_type = resource["type"]
+        text = f"Not imported content: type = {resource_type!r}"
+        if "href" in resource:
+            text += ", href = {!r}".format(resource["href"])
+
+        logger.info("%s", text)
+        return {"html": text}
diff --git a/src/cc2olx/content_parsers/lti.py b/src/cc2olx/content_parsers/lti.py
@@ -0,0 +1,97 @@
+import re
+from typing import Dict, Optional
+
+from lxml import etree
+
+from cc2olx import filesystem
+from cc2olx.content_parsers import AbstractContentParser
+from cc2olx.enums import CommonCartridgeResourceType
+from cc2olx.utils import simple_slug
+
+
+class LtiContentParser(AbstractContentParser):
+    """
+    Parser of the LTI link resources content.
+    """
+
+    NAMESPACES = {
+        "blti": "http://www.imsglobal.org/xsd/imsbasiclti_v1p0",
+        "lticp": "http://www.imsglobal.org/xsd/imslticp_v1p0",
+        "lticm": "http://www.imsglobal.org/xsd/imslticm_v1p0",
+    }
+    DEFAULT_WIDTH = "500"
+    DEFAULT_HEIGHT = "500"
+
+    def _parse_content(self, idref: Optional[str]) -> Optional[dict]:
+        if not idref:
+            return None
+
+        resource = self._cartridge.define_resource(idref)
+        if not resource or not re.match(CommonCartridgeResourceType.LTI_LINK, resource["type"]):
+            return None
+
+        lti_data = self._parse_lti(resource)
+        # For Canvas flavored courses the module meta URL, when present, replaces the parsed launch URL
+        if self._cartridge.is_canvas_flavor:
+            item_data = self._cartridge.module_meta.get_external_tool_item_data(idref)
+            if item_data:
+                lti_data["launch_url"] = item_data.get("url", lti_data["launch_url"])
+        return lti_data
+
+    def _parse_lti(self, resource: dict) -> dict:
+        """
+        Build the LTI data from the first file of the resource.
+        """
+        lti_file_path = self._cartridge.build_res_file_path(resource["children"][0].href)
+        root = filesystem.get_xml_tree(lti_file_path).getroot()
+        title = root.find("blti:title", self.NAMESPACES).text
+        description = root.find("blti:description", self.NAMESPACES).text
+        return {
+            "title": title,
+            "description": description,
+            "launch_url": self._parse_launch_url(root),
+            "height": self._parse_height(root),
+            "width": self._parse_width(root),
+            "custom_parameters": self._parse_custom_parameters(root),
+            "lti_id": self._parse_lti_id(root, title),
+        }
+
+    def _parse_launch_url(self, resource_root: etree._Element) -> str:
+        """
+        Provide the LTI launch URL, preferring the secure one.
+
+        An empty string is provided if neither URL element is present.
+        """
+        for tag_name in ("blti:secure_launch_url", "blti:launch_url"):
+            launch_url = resource_root.find(tag_name, self.NAMESPACES)
+            if launch_url is not None:
+                return launch_url.text
+        return ""
+
+    def _parse_width(self, resource_root: etree._Element) -> str:
+        """
+        Provide the selection width, or the default one if it is not specified.
+        """
+        width = resource_root.find("blti:extensions/lticm:property[@name='selection_width']", self.NAMESPACES)
+        if width is None:
+            return self.DEFAULT_WIDTH
+        return width.text
+
+    def _parse_height(self, resource_root: etree._Element) -> str:
+        """
+        Provide the selection height, or the default one if it is not specified.
+        """
+        height = resource_root.find("blti:extensions/lticm:property[@name='selection_height']", self.NAMESPACES)
+        if height is None:
+            return self.DEFAULT_HEIGHT
+        return height.text
+
+    def _parse_custom_parameters(self, resource_root: etree._Element) -> Dict[str, str]:
+        """
+        Provide the custom parameters as a name-to-text mapping.
+
+        An empty mapping is provided if the custom parameters element is absent.
+        """
+        custom = resource_root.find("blti:custom", self.NAMESPACES)
+        if custom is None:
+            return {}
+        return {option.get("name"): option.text for option in custom}
+
+    def _parse_lti_id(self, resource_root: etree._Element, title: str) -> str:
+        """
+        Provide the LTI identifier.
+        """
+        # For Canvas flavored CC, tool_id can be used as lti_id if present
+        tool_id = resource_root.find("blti:extensions/lticm:property[@name='tool_id']", self.NAMESPACES)
+        if tool_id is not None:
+            return tool_id.text
+        # Without tool_id the identifier is a simple slug made of the title
+        return simple_slug(title)
diff --git a/src/cc2olx/content_parsers/mixins.py b/src/cc2olx/content_parsers/mixins.py
@@ -0,0 +1,40 @@
+import re
+from typing import Dict, Optional
+
+from cc2olx import filesystem
+from cc2olx.constants import WEB_LINK_NAMESPACE
+from cc2olx.enums import CommonCartridgeResourceType
+from cc2olx.models import Cartridge
+
+
+class WebLinkParserMixin:
+    """
+    Mixin with the Common Cartridge Web Link resource parsing functionality.
+
+    The host class is expected to provide the ``_cartridge`` attribute.
+    """
+
+    _cartridge: Cartridge
+
+    def _parse_web_link_content(self, resource: dict) -> Optional[Dict[str, str]]:
+        """
+        Provide the Web Link URL and title, or ``None`` for other resource types.
+        """
+        web_link_match = re.match(CommonCartridgeResourceType.WEB_LINK, resource["type"])
+        if not web_link_match:
+            return None
+
+        web_link_file_path = self._cartridge.build_res_file_path(resource["children"][0].href)
+        root = filesystem.get_xml_tree(web_link_file_path).getroot()
+        # The namespace depends on the version captured from the resource type.
+        namespaces = self._build_web_link_namespace(web_link_match)
+        link_title = root.find("wl:title", namespaces).text
+        link_url = root.find("wl:url", namespaces).get("href")
+        return {"href": link_url, "text": link_title}
+
+    @staticmethod
+    def _build_web_link_namespace(web_link_match: re.Match) -> Dict[str, str]:
+        """
+        Build the Web Link namespace mapping from the matched version groups.
+        """
+        version_parts = {
+            group_name: web_link_match.group(group_name)
+            for group_name in ("major_version", "minor_version")
+        }
+        return {"wl": WEB_LINK_NAMESPACE.format(**version_parts)}