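Fixes #6: download section docs in parallel via a thread pool (new `n_workers` option), make `search` return `None` when the pattern does not match, and update the llms.txt proposal link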

jph00 · jph00 · commit 0536f0cda3a8 · 2024-09-08T14:48:59.000+10:00
diff --git a/llms_txt/core.py b/llms_txt/core.py
@@ -25,7 +25,8 @@ def named_re(nm, pat):
 
 def search(pat, txt, flags=0):
     "Dictionary of matched groups in `pat` within `txt`"
-    return re.search(pat, txt, flags=flags).groupdict()
+    res = re.search(pat, txt, flags=flags)
+    return res.groupdict() if res else None
 
 # %% ../nbs/01_core.ipynb
 def parse_link(txt):
@@ -65,22 +66,23 @@ def parse_llms_file(txt):
 from fastcore.xml import Sections,Project,Doc
 
 # %% ../nbs/01_core.ipynb
-def _doc(url, **kw):
+def _doc(kw):
     "Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs."
+    url = kw.pop('url')
     re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE)
     txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o)]
     return Doc('\n'.join(txt), **kw)
 
 # %% ../nbs/01_core.ipynb
-def _section(nm, items):
+def _section(nm, items, n_workers=None):
     "Create a section containing a `Doc` object for each child."
-    return ft(nm, *[_doc(**o) for o in items])
+    return ft(nm, *parallel(_doc, items, n_workers=n_workers, threadpool=True))
 
 # %% ../nbs/01_core.ipynb
-def mk_ctx(d, optional=True):
+def mk_ctx(d, optional=True, n_workers=None):
     "Create a `Project` with a `Section` for each H2 part in `d`, optionally skipping the 'optional' section."
     skip = '' if optional else 'Optional'
-    sections = [_section(k, v) for k,v in d.sections.items() if k!=skip]
+    sections = [_section(k, v, n_workers=n_workers) for k,v in d.sections.items() if k!=skip]
     return Project(title=d.title, summary=d.summary)(d.info, *sections)
 
 # %% ../nbs/01_core.ipynb
@@ -89,17 +91,18 @@ def get_sizes(ctx):
     return {o.tag:{p.title:len(p.children[0]) for p in o.children} for o in ctx.children if hasattr(o,'tag')}
 
 # %% ../nbs/01_core.ipynb
-def create_ctx(txt, optional=False):
+def create_ctx(txt, optional=False, n_workers=None):
     "A `Project` with a `Section` for each H2 part in `txt`, optionally skipping the 'optional' section."
     d = parse_llms_file(txt)
-    ctx = mk_ctx(d, optional=optional)
+    ctx = mk_ctx(d, optional=optional, n_workers=n_workers)
     return to_xml(ctx, do_escape=False)
 
 # %% ../nbs/01_core.ipynb
 @call_parse
 def llms_txt2ctx(
     fname:str, # File name to read
-    optional:bool_arg=False # Include 'optional' section?
+    optional:bool_arg=False, # Include 'optional' section?
+    n_workers:int=None # Number of threads to use for parallel downloading
 ):
     "Print a `Project` with a `Section` for each H2 part in file read from `fname`, optionally skipping the 'optional' section."
-    print(create_ctx(Path(fname).read_text(), optional=optional))
+    print(create_ctx(Path(fname).read_text(), optional=optional, n_workers=n_workers))
diff --git a/nbs/01_core.ipynb b/nbs/01_core.ipynb
@@ -139,7 +139,8 @@
     "\n",
     "def search(pat, txt, flags=0):\n",
     "    \"Dictionary of matched groups in `pat` within `txt`\"\n",
-    "    return re.search(pat, txt, flags=flags).groupdict()"
+    "    res = re.search(pat, txt, flags=flags)\n",
+    "    return res.groupdict() if res else None"
    ]
   },
   {
@@ -679,8 +680,9 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def _doc(url, **kw):\n",
+    "def _doc(kw):\n",
     "    \"Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs.\"\n",
+    "    url = kw.pop('url')\n",
     "    re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE)\n",
     "    txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o)]\n",
     "    return Doc('\\n'.join(txt), **kw)"
@@ -693,9 +695,9 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def _section(nm, items):\n",
+    "def _section(nm, items, n_workers=None):\n",
     "    \"Create a section containing a `Doc` object for each child.\"\n",
-    "    return ft(nm, *[_doc(**o) for o in items])"
+    "    return ft(nm, *parallel(_doc, items, n_workers=n_workers, threadpool=True))"
    ]
   },
   {
@@ -705,10 +707,10 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def mk_ctx(d, optional=True):\n",
+    "def mk_ctx(d, optional=True, n_workers=None):\n",
     "    \"Create a `Project` with a `Section` for each H2 part in `d`, optionally skipping the 'optional' section.\"\n",
     "    skip = '' if optional else 'Optional'\n",
-    "    sections = [_section(k, v) for k,v in d.sections.items() if k!=skip]\n",
+    "    sections = [_section(k, v, n_workers=n_workers) for k,v in d.sections.items() if k!=skip]\n",
     "    return Project(title=d.title, summary=d.summary)(d.info, *sections)"
    ]
   },
@@ -753,10 +755,10 @@
     {
      "data": {
       "text/plain": [
-       "{'docs': {'FastHTML quick start': 25803,\n",
+       "{'docs': {'FastHTML quick start': 27376,\n",
        "  'HTMX reference': 26427,\n",
        "  'Starlette quick guide': 7936},\n",
-       " 'examples': {'Todo list application': 18588},\n",
+       " 'examples': {'Todo list application': 18558},\n",
        " 'optional': {'Starlette full documentation': 48331}}"
       ]
      },
@@ -777,7 +779,7 @@
     {
      "data": {
       "text/plain": [
-       "128271"
+       "129814"
       ]
      },
      "execution_count": null,
@@ -796,10 +798,10 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def create_ctx(txt, optional=False):\n",
+    "def create_ctx(txt, optional=False, n_workers=None):\n",
     "    \"A `Project` with a `Section` for each H2 part in `txt`, optionally skipping the 'optional' section.\"\n",
     "    d = parse_llms_file(txt)\n",
-    "    ctx = mk_ctx(d, optional=optional)\n",
+    "    ctx = mk_ctx(d, optional=optional, n_workers=n_workers)\n",
     "    return to_xml(ctx, do_escape=False)"
    ]
   },
@@ -813,10 +815,11 @@
     "@call_parse\n",
     "def llms_txt2ctx(\n",
     "    fname:str, # File name to read\n",
-    "    optional:bool_arg=False # Include 'optional' section?\n",
+    "    optional:bool_arg=False, # Include 'optional' section?\n",
+    "    n_workers:int=None # Number of threads to use for parallel downloading\n",
     "):\n",
     "    \"Print a `Project` with a `Section` for each H2 part in file read from `fname`, optionally skipping the 'optional' section.\"\n",
-    "    print(create_ctx(Path(fname).read_text(), optional=optional))"
+    "    print(create_ctx(Path(fname).read_text(), optional=optional, n_workers=n_workers))"
    ]
   },
   {
diff --git a/nbs/llms.txt b/nbs/llms.txt
@@ -4,7 +4,7 @@
 
 ## Docs
 
-- [llms.txt proposal](https://llmstxt.org/index-commonmark.md): The proposal for llms.txt
+- [llms.txt proposal](https://llmstxt.org/index.md): The proposal for llms.txt
 - [Python library docs](https://llmstxt.org/intro.html.md): Docs for `llms-txt` python lib
 - [ed demo](https://llmstxt.org/ed-commonmark.md): Tongue-in-cheek example of how llms.txt could be used in the classic `ed` editor, used to show how editors could incorporate llms.txt in general.