from IPython.display import Markdown,HTML
from fastcore.test import *
Download helpers
clean_md
clean_md (text, rm_comments=True, rm_details=True)
Remove comments and <details>
sections from text
read_md
read_md (url, rm_comments=True, rm_details=True, params:QueryParamTypes|None=None, headers:HeaderTypes|None=None, cookies:CookieTypes|None=None, auth:AuthTypes|None=None, proxy:ProxyTypes|None=None, proxies:ProxiesTypes|None=None, follow_redirects:bool=False, cert:CertTypes|None=None, verify:VerifyTypes=True, timeout:TimeoutTypes=Timeout(timeout=5.0), trust_env:bool=True)
Read text from url
and clean with clean_docs
mdurl = 'https://claudette.answer.ai/index.html.md'
md = read_md(mdurl)
md # Markdown(md)
html2md
html2md (s:str)
Convert s
from HTML to markdown
read_html
read_html (url, sel=None, rm_comments=True, rm_details=True)
Get url
, optionally selecting CSS selector sel
, and convert to clean markdown
htmlurl = 'https://hypermedia.systems/hypermedia-a-reintroduction/'
hmd = read_html(htmlurl)
hmd # Markdown(hmd)
get_llmstxt
get_llmstxt (url, optional=False, n_workers=None)
Get llms.txt file from url and expand it with llms_txt.create_ctx()
# print(get_llmstxt('https://llmstxt.org/llms.txt'))
split_url
split_url (url)
Split url
into base, path, and file name, normalising name to '/' if empty
urls = ('https://claudette.answer.ai/path/index.html', 'https://claudette.answer.ai/',
        'https://claudette.answer.ai/index.html', 'https://llmstxt.org', 'https://llmstxt.org/')
[split_url(o) for o in urls]
[('https://claudette.answer.ai', '/path', '/index.html'),
('https://claudette.answer.ai', '/', ''),
('https://claudette.answer.ai', '', '/index.html'),
('https://llmstxt.org', '/', ''),
('https://llmstxt.org', '/', '')]
find_docs
find_docs (url)
If available, return LLM-friendly llms.txt context or markdown file location from url
fl_url = 'https://answerdotai.github.io/fastlite'
find_docs(fl_url)
'https://answerdotai.github.io/fastlite/index.html.md'
for o in urls: print(find_docs(o))
https://claudette.answer.ai/index.html.md
https://claudette.answer.ai/index.html.md
https://claudette.answer.ai/index.html.md
https://llmstxt.org/llms.txt
https://llmstxt.org/llms.txt
suffixes = ["/", "/tmp", "/tmp/", "/tmp/tmp", "/tmp/tmp/"]
for suff in suffixes:
    for o in urls: test_eq(find_docs(o), find_docs(o+suff))
test_eq(find_docs("https://github.com"), "https://github.com/llms.txt")
test_eq(find_docs("https://github.com/AnswerDotAI"), "https://github.com/llms.txt")
test_eq(find_docs("https://github.com/AnswerDotAI/"), "https://github.com/llms.txt")
read_docs
read_docs (url, optional=False, n_workers=None, rm_comments=True, rm_details=True)
If available, return LLM-friendly llms.txt context or markdown file response for url