from IPython.display import Markdown,HTML
from fastcore.test import *

Download helpers
clean_md
def clean_md(
text, rm_comments:bool=True, rm_details:bool=True
):
Remove comments and <details> sections from text
read_md
def read_md(
url, rm_comments:bool=True, rm_details:bool=True, params:QueryParamTypes | None=None,
headers:HeaderTypes | None=None, cookies:CookieTypes | None=None, auth:AuthTypes | None=None,
proxy:ProxyTypes | None=None, follow_redirects:bool=False, verify:ssl.SSLContext | str | bool=True,
timeout:TimeoutTypes=Timeout(timeout=5.0), trust_env:bool=True
):
Read text from url and clean with clean_md
mdurl = 'https://claudette.answer.ai/index.html.md'
md = read_md(mdurl)
# Markdown(md)

html2md
def html2md(
s:str, ignore_links:bool=True
):
Convert s from HTML to markdown
read_html
def read_html(
url, # URL to read
sel:NoneType=None, # Read only outerHTML of CSS selector `sel`
rm_comments:bool=True, # Removes HTML comments
rm_details:bool=True, # Removes `<details>` tags
multi:bool=False, # Get all matches to `sel` or first one
wrap_tag:NoneType=None, # If multi, each selection wrapped with <wrap_tag>content</wrap_tag>
ignore_links:bool=True
):
Get url, optionally selecting CSS selector sel, and convert to clean markdown
# test single class selector
listings = read_html('https://www.answer.ai/', sel='.listing-description')
assert len(listings) < 500
# Test multi class selector
listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True)
assert len(listings) > 1000 # returns more than single so selecting multi
# Test multi_wrap_tag
listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True, wrap_tag='document')
assert '<document>' in listings and '</document>' in listings

read_html('https://www.answer.ai/', sel='.listing-description', ignore_links=False)
'[ How I created a book chapter from video transcripts with SolveIt ](./posts/2025-10-13-video-to-doc.html)\n\n'
# test tag css selectors
assert len(read_html('https://www.answer.ai/', sel='div.listing-description', multi=True)) > 1000
assert len(read_html('https://www.answer.ai/', sel='div', multi=True)) > 1000

htmlurl = 'https://hypermedia.systems/hypermedia-a-reintroduction/'
hmd = read_html(htmlurl)
assert len(hmd) > 100
# Markdown(hmd)

get_llmstxt
def get_llmstxt(
url, optional:bool=False, n_workers:NoneType=None
):
Get llms.txt file from url and expand it with llms_txt.create_ctx()
# print(get_llmstxt('https://llmstxt.org/llms.txt'))

split_url
def split_url(
url
):
Split url into base, path, and file name, normalising name to ‘/’ if empty
urls = ('https://claudette.answer.ai/path/', 'https://claudette.answer.ai/', 'https://llmstxt.org', 'https://llmstxt.org/')
[split_url(o) for o in urls]
[('https://claudette.answer.ai', '', '/path'),
('https://claudette.answer.ai', '/', ''),
('https://llmstxt.org', '/', ''),
('https://llmstxt.org', '/', '')]
find_docs
def find_docs(
url
):
If available, return LLM-friendly llms.txt context or markdown file location from url
fl_url = 'https://answerdotai.github.io/fastlite'
find_docs(fl_url)
'https://answerdotai.github.io/fastlite/llms.txt'
for o in urls: print(find_docs(o))
https://claudette.answer.ai/llms.txt
https://claudette.answer.ai/llms.txt
https://llmstxt.org/llms.txt
https://llmstxt.org/llms.txt
suffixes = ["/", "/tmp", "/tmp/tmp/"]
for suff in suffixes:
for o in urls: test_eq(find_docs(o), find_docs(o+suff))
test_eq(find_docs("https://github.com"), "https://github.com/llms.txt")
test_eq(find_docs("https://github.com/AnswerDotAI"), "https://github.com/llms.txt")
test_eq(find_docs("https://github.com/AnswerDotAI/"), "https://github.com/llms.txt")

read_docs
def read_docs(
url, optional:bool=False, n_workers:NoneType=None, rm_comments:bool=True, rm_details:bool=True
):
If available, return LLM-friendly llms.txt context or markdown file response for url