from copy import deepcopy
from fastcore.test import *xml source
JSON to XML
json_to_xml
def json_to_xml(
d:dict, # JSON dictionary to convert
rnm:str, # Root name
)->str:
Convert d to XML.
JSON doesn’t map as nicely to XML as the data structure used in fastcore.xml, but for simple XML trees it can be convenient – for example:
a = dict(surname='Howard', firstnames=['Jeremy','Peter'],
address=dict(state='Queensland',country='Australia'))
hl_md(json_to_xml(a, 'person'))<person>
<surname>Howard</surname>
<firstnames>
<item>Jeremy</item>
<item>Peter</item>
</firstnames>
<address>
<state>Queensland</state>
<country>Australia</country>
</address>
</person>Including documents
Notebooks
nbp = Path('00_xml.ipynb')
nb = dict2obj(nbp.read_json())
cells = nb.cells
cell = cells[7]
cell{ 'cell_type': 'code',
'execution_count': {},
'id': '005a5be4',
'metadata': {},
'outputs': [],
'source': ["a = dict(surname='Howard', firstnames=['Jeremy','Peter'],\n", " address=dict(state='Queensland',country='Australia'))\n", "hl_md(json_to_xml(a, 'person'))"]}get_mime_text
def get_mime_text(
data
):
Get text from MIME bundle, preferring markdown over plain
cell2out
def cell2out(
o
):
Convert single notebook output to XML format
for o in cell.outputs: print(to_xml(cell2out(o), do_escape=False))cell2xml
def cell2xml(
cell, out:bool=True, ids:bool=True, nums:bool=False
):
Convert notebook cell to concise XML format
hl_md(cell2xml(cell, out=False, nums=True))<code id="005a5be4"> 1 │ a = dict(surname='Howard', firstnames=['Jeremy','Peter'],
2 │ address=dict(state='Queensland',country='Australia'))
3 │ hl_md(json_to_xml(a, 'person'))</code>nb2xml
def nb2xml(
fname:NoneType=None, nb:NoneType=None, out:bool=True, ids:bool=True, nums:bool=False
):
Convert notebook to XML format
cells2xml
def cells2xml(
cells, wrap:function=<function __getattr__.<locals>._f at 0x7f9c3fb7e7a0>, out:bool=True, ids:bool=True,
nums:bool=False
):
Convert notebook to XML format
nbsml = deepcopy(nb)
del(nbsml.cells[2:])
hl_md(nb2xml(nb=nbsml))<notebook><code id="efe78920"><source>#|default_exp xml</code><md id="87ea05a3"><source># xml source</md></notebook>hl_md(nb2xml(nb=nbsml, ids=False))<notebook><code><source>#|default_exp xml</code><md><source># xml source</md></notebook>py2sigs
def py2sigs(
fname:NoneType=None, src:NoneType=None
):
Return signature+docstring text for all functions and class methods in source
get_docstring
def get_docstring(
node, lines
):
Get docstring from source lines if present
Documents
According to Anthropic, “it’s essential to structure your prompts in a way that clearly separates the input data from the instructions”. They recommend using something like the following:
Here are some documents for you to reference for your task:
<documents>
<document index="1">
<source>
(URL, file name, hash, etc)
</source>
<document_content>
(the text content)
</document_content>
</document>
</documents>
We will create some small helper functions to make it easier to generate context in this format, although we’ll use <src> instead of <source> to avoid conflict with that HTML tag. Although it’s based on Anthropic’s recommendation, it’s likely to work well with other models too.
We’ll use a doctype named tuple to store our (src, content) pairs.
Since Anthropic’s example shows newlines before and after each tag, we’ll do the same.
to_xml(Src('a'))'<src>a</src>'
to_xml(Document('a'))'<document>a</document>'
mk_doctype
def mk_doctype(
content:str, # The document content
src:Optional=None, # URL, filename, etc; defaults to `md5(content)` if not provided
)->namedtuple:
Create a doctype named tuple
This is a convenience wrapper to ensure that a doctype has the needed information in the right format.
doc = 'This is a "sample"'
mk_doctype(doc)doctype(src='\n47e19350\n', content='\nThis is a "sample"\n')
mk_doc
def mk_doc(
index:int, # The document index
content:str, # The document content
src:Optional=None, # URL, filename, etc; defaults to `md5(content)` if not provided
kwargs:VAR_KEYWORD
)->tuple:
Create an ft format tuple for a single doc in Anthropic’s recommended format
We can now generate XML for one document in the suggested format:
mk_doc(1, doc, title="test")docs_xml
def docs_xml(
docs:list, # The content of each document
srcs:Optional=None, # URLs, filenames, etc; each one defaults to `md5(content)` if not provided
prefix:bool=False, # Include Anthropic's suggested prose intro?
details:Optional=None, # Optional list of dicts with additional attrs for each doc
title:str=None, # Optional title attr for Documents element
)->str:
Create an XML string containing docs in Anthropic’s recommended format
Putting it all together, we have our final XML format:
docs = [doc, 'And another one']
srcs = [None, 'doc.txt']
print(docs_xml(docs, srcs))<documents><document index="1"><src>
47e19350
</src><document-content>
This is a "sample"
</document-content></document><document index="2"><src>
doc.txt
</src><document-content>
And another one
</document-content></document></documents>
Context creation
Now that we can generate Anthropic’s XML format, let’s make it easy for a few common cases.
File list to context
For generating XML context from files, we’ll just read them as text and use the file names as src.
read_file
def read_file(
fname, max_size:NoneType=None, sigs_only:bool=False, nb:NoneType=None, out:bool=True, ids:bool=True,
nums:bool=False
):
Read file content, converting notebooks to XML if needed
files2ctx
def files2ctx(
fnames:list, # List of file names to add to context
srcs:Optional=None, # Use the labels instead of `fnames`
max_size:int=None, # Skip files larger than this (bytes)
out:bool=True, # Include notebook cell outputs?
ids:bool=True, # Include cell ids in notebooks?
nums:bool=False, # Include line numbers in notebook cell source?
sigs_only:bool=False, # For .py files, only include signatures and docstrings
prefix:bool=False, # Include Anthropic's suggested prose intro?
details:Optional=None, # Optional list of dicts with additional attrs for each doc
title:str=None, # Optional title attr for Documents element
)->str: # XML for LM context
Convert files to XML context, handling notebooks
fnames = ['samples/sample_core.py', 'samples/sample_styles.css']
hl_md(files2ctx(fnames, max_size=120))<documents><document index="1"><src>
samples/sample_core.py
</src><document-content>
[Skipped: sample_core.py exceeds 120 bytes]
</document-content></document><document index="2"><src>
samples/sample_styles.css
</src><document-content>
.cell { margin-bottom: 1rem; }
.cell > .sourceCode { margin-bottom: 0; }
.cell-output > pre { margin-bottom: 0; }
</document-content></document></documents>Folder to context
folder2ctx
def folder2ctx(
path:Union, # Folder to read
prefix:bool=False, # Include Anthropic's suggested prose intro?
out:bool=True, # Include notebook cell outputs?
include_base:bool=True, # Include full path in src?
title:str=None, # Optional title attr for Documents element
max_size:int=100000, # Skip files larger than this (bytes)
max_total:int=10000000, # Max total output size in bytes
readme_first:bool=False, # Prioritize README files at start of context?
files_only:bool=False, # Return dict of {filename: size} instead of context?
sigs_only:bool=False, # Return signatures instead of full text for python files?
ids:bool=True, # Include cell ids in notebooks?
recursive:bool=True, # search subfolders
symlinks:bool=True, # follow symlinks?
file_glob:str=None, # Only include files matching glob
file_re:str=None, # Only include files matching regex
folder_re:str=None, # Only enter folders matching regex
skip_file_glob:str=None, # Skip files matching glob
skip_file_re:str=None, # Skip files matching regex
skip_folder_re:str=None, # Skip folders matching regex,
ret_folders:bool=False, # return folders, not just files
sort:bool=True, # sort files by name within each folder
types:str | list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
exts:str | list=None, # list or comma-separated str of exts to include
)->Union:
Convert folder contents to XML context, handling notebooks
print(folder2ctx('samples', prefix=True, types='py'))Here are some documents for you to reference for your task:
<documents><document index="1"><src>
samples/sample_core.py
</src><document-content>
import inspect
empty = inspect.Parameter.empty
models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
</document-content></document></documents>
sym2file
def sym2file(
sym
):
Return md string with filepath and contents for a symbol’s source file
# from dialoghelper import *# add_msg(sym2file(Path))sym2folderctx
def sym2folderctx(
sym,
types:str | list='py', # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
skip_file_re:str='^_mod', prefix:bool=False, # Include Anthropic's suggested prose intro?
out:bool=True, # Include notebook cell outputs?
include_base:bool=True, # Include full path in src?
title:str=None, # Optional title attr for Documents element
max_size:int=100000, # Skip files larger than this (bytes)
max_total:int=10000000, # Max total output size in bytes
readme_first:bool=False, # Prioritize README files at start of context?
files_only:bool=False, # Return dict of {filename: size} instead of context?
sigs_only:bool=False, # Return signatures instead of full text for python files?
ids:bool=True, # Include cell ids in notebooks?
recursive:bool=True, # search subfolders
symlinks:bool=True, # follow symlinks?
file_glob:str=None, # Only include files matching glob
file_re:str=None, # Only include files matching regex
folder_re:str=None, # Only enter folders matching regex
skip_file_glob:str=None, # Skip files matching glob
skip_folder_re:str=None, # Skip folders matching regex,
ret_folders:bool=False, # return folders, not just files
sort:bool=True, # sort files by name within each folder
exts:str | list=None, # list or comma-separated str of exts to include
):
Return folder context for a symbol’s source file location
# add_msg(sym2folderctx(test_eq), msg_type='raw')sym2pkgpath
def sym2pkgpath(
sym
):
Get root package path for a symbol
sym2pkgpath(test_eq)Path('/Users/jhoward/aai-ws/fastcore/fastcore')
sym2pkgctx
def sym2pkgctx(
sym,
types:str | list='py', # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
skip_file_re:str='^_mod', prefix:bool=False, # Include Anthropic's suggested prose intro?
out:bool=True, # Include notebook cell outputs?
include_base:bool=True, # Include full path in src?
title:str=None, # Optional title attr for Documents element
max_size:int=100000, # Skip files larger than this (bytes)
max_total:int=10000000, # Max total output size in bytes
readme_first:bool=False, # Prioritize README files at start of context?
files_only:bool=False, # Return dict of {filename: size} instead of context?
sigs_only:bool=False, # Return signatures instead of full text for python files?
ids:bool=True, # Include cell ids in notebooks?
recursive:bool=True, # search subfolders
symlinks:bool=True, # follow symlinks?
file_glob:str=None, # Only include files matching glob
file_re:str=None, # Only include files matching regex
folder_re:str=None, # Only enter folders matching regex
skip_file_glob:str=None, # Skip files matching glob
skip_folder_re:str=None, # Skip folders matching regex,
ret_folders:bool=False, # return folders, not just files
sort:bool=True, # sort files by name within each folder
exts:str | list=None, # list or comma-separated str of exts to include
):
Return repo context for a symbol’s root package
# add_msg(sym2pkgctx(tqdm), msg_type='raw')
After you install toolslm, folder2ctx becomes available from the command line.
!folder2ctx -husage: folder2ctx [-h] [--path PATH] [--no-out] [--prefix] [--no-include_base]
[--title TITLE] [--max_size MAX_SIZE] [--max_total MAX_TOTAL]
[--readme_first] [--files_only] [--sigs_only] [--no-ids]
[--no-recursive] [--no-symlinks] [--file_glob FILE_GLOB]
[--file_re FILE_RE] [--folder_re FOLDER_RE]
[--skip_file_glob SKIP_FILE_GLOB]
[--skip_file_re SKIP_FILE_RE]
[--skip_folder_re SKIP_FOLDER_RE] [--ret_folders] [--no-sort]
[--types TYPES] [--exts EXTS]
CLI to convert folder contents to XML context, handling notebooks
options:
-h, --help show this help message and exit
--path PATH Folder name containing files to add to
context (default: .)
--no-out Include notebook cell outputs? (default:
True)
--prefix Include Anthropic's suggested prose intro?
(default: False)
--no-include_base Include full path in src? (default: True)
--title TITLE Optional title attr for Documents element
--max_size MAX_SIZE Skip files larger than this (bytes) (default:
100000)
--max_total MAX_TOTAL Max total output size in bytes (default:
10000000)
--readme_first Prioritize README files at start of context?
(default: False)
--files_only Return dict of {filename: size} instead of
context? (default: False)
--sigs_only Return signatures instead of full text for
python files? (default: False)
--no-ids Include cell ids in notebooks? (default:
True)
--no-recursive search subfolders (default: True)
--no-symlinks follow symlinks? (default: True)
--file_glob FILE_GLOB Only include files matching glob
--file_re FILE_RE Only include files matching regex
--folder_re FOLDER_RE Only enter folders matching regex
--skip_file_glob SKIP_FILE_GLOB Skip files matching glob
--skip_file_re SKIP_FILE_RE Skip files matching regex
--skip_folder_re SKIP_FOLDER_RE Skip folders matching regex,
--ret_folders return folders, not just files (default:
False)
--no-sort sort files by name within each folder
(default: True)
--types TYPES list or comma-separated str of ext types
from: py, js, java, c, cpp, rb, r, ex, sh,
web, doc, cfg
--exts EXTS list or comma-separated str of exts to
include
parse_gh_url
def parse_gh_url(
url
):
Parse GitHub URL into (owner, repo, type, ref, path) or None
repo2ctx
def repo2ctx(
owner:str, # GitHub repo owner or "owner/repo" or a full github URL
repo:str=None, # GitHub repo name (leave empty if using "owner/repo" or URL format for owner param)
ref:str=None, # Git ref (branch/tag/sha) (taken from URL if not provided); defaults to repo's default branch
folder:str=None, # Only include files under this path (taken from URL if not provided)
show_filters:bool=True, # Include filter info in title?
token:str=None, # GitHub token (uses GITHUB_TOKEN env var if None)
prefix:bool=False, # Include Anthropic's suggested prose intro?
out:bool=True, # Include notebook cell outputs?
include_base:bool=True, # Include full path in src?
title:str=None, # Optional title attr for Documents element
max_size:int=100000, # Skip files larger than this (bytes)
max_total:int=10000000, # Max total output size in bytes
readme_first:bool=False, # Prioritize README files at start of context?
files_only:bool=False, # Return dict of {filename: size} instead of context?
sigs_only:bool=False, # Return signatures instead of full text for python files?
ids:bool=True, # Include cell ids in notebooks?
recursive:bool=True, # search subfolders
symlinks:bool=True, # follow symlinks?
file_glob:str=None, # Only include files matching glob
file_re:str=None, # Only include files matching regex
folder_re:str=None, # Only enter folders matching regex
skip_file_glob:str=None, # Skip files matching glob
skip_file_re:str=None, # Skip files matching regex
skip_folder_re:str=None, # Skip folders matching regex,
ret_folders:bool=False, # return folders, not just files
sort:bool=True, # sort files by name within each folder
types:str | list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
exts:str | list=None, # list or comma-separated str of exts to include
)->Union: # XML for LM context, or dict of file sizes
Convert GitHub repo to XML context without cloning
print(repo2ctx('answerdotai/toolslm', exts=('ipynb','py'), skip_file_re='^_', out=False, max_total=500))<documents title="GitHub repository contents from answerdotai/toolslm/main (filters applied -- exts: ipynb, py | skip_file_re: ^_ | max_total: 500)"><document index="1"><src>
00_xml.ipynb
</src><document-content>
<notebook><code id="efe78920">#|default_exp xml</code><md id="87ea05a3"># xml source</md><code id="033c76fd">#| export
import hashlib, inspect, xml.etree.ElementTree as ET, ast
from collections import namedtuple
from ghapi.all
[TRUNCATED: output size 104276 exceeded max size 500 bytes]
print(repo2ctx('answerdotai/toolslm', types='py', skip_file_re='^_', out=False, files_only=True)){'00_xml.ipynb': 39144, '01_funccall.ipynb': 66233, '02_shell.ipynb': 6295, '03_download.ipynb': 12178, '04_md_hier.ipynb': 8091, 'index.ipynb': 3089, 'setup.py': 2596, 'samples/sample_core.py': 134, 'toolslm/download.py': 4481, 'toolslm/funccall.py': 11378, 'toolslm/md_hier.py': 11010, 'toolslm/shell.py': 1566, 'toolslm/xml.py': 12837}
print(repo2ctx('https://github.com/AnswerDotAI/toolslm/tree/main/samples'))<documents title="GitHub repository contents from AnswerDotAI/toolslm/main/samples"><document index="1"><src>
sample_core.py
</src><document-content>
import inspect
empty = inspect.Parameter.empty
models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
</document-content></document><document index="2"><src>
sample_styles.css
</src><document-content>
.cell { margin-bottom: 1rem; }
.cell > .sourceCode { margin-bottom: 0; }
.cell-output > pre { margin-bottom: 0; }
</document-content></document></documents>