Download helpers

from IPython.display import Markdown, HTML
from fastcore.test import *

source

clean_md

 clean_md (text, rm_comments=True, rm_details=True)

Remove comments and <details> sections from text
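
For example, on a small made-up string (pass rm_comments=False or rm_details=False to keep either kind of block):

s = '''Some text.
<!-- an HTML comment -->
<details>collapsed content</details>
More text.'''
print(clean_md(s))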


source

read_md

 read_md (url, rm_comments=True, rm_details=True,
          params:QueryParamTypes|None=None, headers:HeaderTypes|None=None,
          cookies:CookieTypes|None=None, auth:AuthTypes|None=None,
          proxy:ProxyTypes|None=None, proxies:ProxiesTypes|None=None,
          follow_redirects:bool=False, cert:CertTypes|None=None,
          verify:VerifyTypes=True,
          timeout:TimeoutTypes=Timeout(timeout=5.0), trust_env:bool=True)

Read text from url and clean with clean_md

mdurl = 'https://claudette.answer.ai/index.html.md'
md = read_md(mdurl)
# Markdown(md)
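
The remaining parameters mirror httpx's request options, so HTTP behaviour can be tuned per call; for instance (the header value and timeout here are illustrative, not required):

md = read_md(mdurl, headers={'User-Agent': 'docs-fetcher'}, follow_redirects=True, timeout=30.0)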

source

html2md

 html2md (s:str)

Convert s from HTML to markdown
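
A quick check on a hand-written snippet (the exact markdown produced depends on the underlying converter):

h = '<h1>Title</h1><p>Some <b>bold</b> text and a <a href="https://example.com">link</a>.</p>'
print(html2md(h))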


source

read_html

 read_html (url, sel=None, rm_comments=True, rm_details=True)

Get url, optionally selecting CSS selector sel, and convert to clean markdown

htmlurl = 'https://hypermedia.systems/hypermedia-a-reintroduction/'
hmd = read_html(htmlurl)
# Markdown(hmd)
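
Passing sel restricts conversion to the part of the page matching a CSS selector, e.g. to keep just the main content (assuming the page wraps it in a main tag, which won't hold for every site):

hmd_main = read_html(htmlurl, sel='main')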

source

get_llmstxt

 get_llmstxt (url, optional=False, n_workers=None)

Get llms.txt file from url and expand it with llms_txt.create_ctx()

# print(get_llmstxt('https://llmstxt.org/llms.txt'))
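
optional and n_workers are forwarded to llms_txt.create_ctx, controlling whether the Optional section is included and how many parallel workers fetch the linked pages (the values here are arbitrary):

ctx = get_llmstxt('https://llmstxt.org/llms.txt', optional=True, n_workers=4)
# print(ctx)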

source

split_url

 split_url (url)

Split url into base, path, and file name, normalising path to '/' when the file name is empty

urls = ('https://claudette.answer.ai/path/index.html', 'https://claudette.answer.ai/',
        'https://claudette.answer.ai/index.html', 'https://llmstxt.org', 'https://llmstxt.org/')

[split_url(o) for o in urls]
[('https://claudette.answer.ai', '/path', '/index.html'),
 ('https://claudette.answer.ai', '/', ''),
 ('https://claudette.answer.ai', '', '/index.html'),
 ('https://llmstxt.org', '/', ''),
 ('https://llmstxt.org', '/', '')]

source

find_docs

 find_docs (url)

If available, return LLM-friendly llms.txt context or markdown file location from url

fl_url = 'https://answerdotai.github.io/fastlite'
find_docs(fl_url)
'https://answerdotai.github.io/fastlite/index.html.md'
for o in urls: print(find_docs(o))
https://claudette.answer.ai/index.html.md
https://claudette.answer.ai/index.html.md
https://claudette.answer.ai/index.html.md
https://llmstxt.org/llms.txt
https://llmstxt.org/llms.txt
suffixes = ["/", "/tmp", "/tmp/", "/tmp/tmp", "/tmp/tmp/"]
for suff in suffixes:
    for o in urls: test_eq(find_docs(o), find_docs(o+suff))
test_eq(find_docs("https://github.com"), "https://github.com/llms.txt")
test_eq(find_docs("https://github.com/AnswerDotAI"), "https://github.com/llms.txt")
test_eq(find_docs("https://github.com/AnswerDotAI/"), "https://github.com/llms.txt")

source

read_docs

 read_docs (url, optional=False, n_workers=None, rm_comments=True,
            rm_details=True)

If available, return LLM-friendly llms.txt context or markdown file response for url
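
Usage mirrors find_docs above, but returns the fetched, cleaned content rather than its location (a minimal sketch reusing fl_url from earlier):

ctx = read_docs(fl_url)
# Markdown(ctx)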