xml source

from copy import deepcopy

from fastcore.test import *

JSON to XML


source

json_to_xml


def json_to_xml(
    d:dict, # JSON dictionary to convert
    rnm:str, # Root name
)->str:

Convert d to XML.

JSON doesn’t map as nicely to XML as the data structure used in fastcore.xml, but for simple XML trees it can be convenient – for example:

a = dict(surname='Howard', firstnames=['Jeremy','Peter'],
         address=dict(state='Queensland',country='Australia'))
hl_md(json_to_xml(a, 'person'))
<person>
  <surname>Howard</surname>
  <firstnames>
    <item>Jeremy</item>
    <item>Peter</item>
  </firstnames>
  <address>
    <state>Queensland</state>
    <country>Australia</country>
  </address>
</person>

Including documents

Notebooks

nbp = Path('00_xml.ipynb')
nb = dict2obj(nbp.read_json())
cells = nb.cells
cell = cells[7]
cell
{ 'cell_type': 'code',
  'execution_count': {},
  'id': '005a5be4',
  'metadata': {},
  'outputs': [],
  'source': ["a = dict(surname='Howard', firstnames=['Jeremy','Peter'],\n", "         address=dict(state='Queensland',country='Australia'))\n", "hl_md(json_to_xml(a, 'person'))"]}

source

get_mime_text


def get_mime_text(
    data
):

Get text from MIME bundle, preferring markdown over plain


source

cell2out


def cell2out(
    o
):

Convert single notebook output to XML format

for o in cell.outputs: print(to_xml(cell2out(o), do_escape=False))

source

cell2xml


def cell2xml(
    cell, out:bool=True, ids:bool=True, nums:bool=False
):

Convert notebook cell to concise XML format

hl_md(cell2xml(cell, out=False, nums=True))
<code id="005a5be4">     1 │ a = dict(surname='Howard', firstnames=['Jeremy','Peter'],
     2 │          address=dict(state='Queensland',country='Australia'))
     3 │ hl_md(json_to_xml(a, 'person'))</code>

source

nb2xml


def nb2xml(
    fname:NoneType=None, nb:NoneType=None, out:bool=True, ids:bool=True, nums:bool=False
):

Convert notebook to XML format


source

cells2xml


def cells2xml(
    cells, wrap:function=<function __getattr__.<locals>._f at 0x7f9c3fb7e7a0>, out:bool=True, ids:bool=True,
    nums:bool=False
):

Convert notebook to XML format

nbsml = deepcopy(nb)
del(nbsml.cells[2:])

hl_md(nb2xml(nb=nbsml))
<notebook><code id="efe78920"><source>#|default_exp xml</code><md id="87ea05a3"><source># xml source</md></notebook>
hl_md(nb2xml(nb=nbsml, ids=False))
<notebook><code><source>#|default_exp xml</code><md><source># xml source</md></notebook>

source

py2sigs


def py2sigs(
    fname:NoneType=None, src:NoneType=None
):

Return signature+docstring text for all functions and class methods in source


source

get_docstring


def get_docstring(
    node, lines
):

Get docstring from source lines if present

Documents

According to Anthropic, “it’s essential to structure your prompts in a way that clearly separates the input data from the instructions”. They recommend using something like the following:

Here are some documents for you to reference for your task:
    
<documents>
<document index="1">
<source>
(URL, file name, hash, etc)
</source>
<document_content>
(the text content)
</document_content>
</document>
</documents>

We will create some small helper functions to make it easier to generate context in this format, although we’ll use <src> instead of <source> to avoid conflict with that HTML tag. Although it’s based on Anthropic’s recommendation, it’s likely to work well with other models too.

We’ll use a doctype named tuple to store our (src, content) pairs.

Since Anthropic’s example shows newlines before and after each tag, we’ll do the same.

to_xml(Src('a'))
'<src>a</src>'
to_xml(Document('a'))
'<document>a</document>'

source

mk_doctype


def mk_doctype(
    content:str, # The document content
    src:Optional=None, # URL, filename, etc; defaults to `md5(content)` if not provided
)->namedtuple:

Create a doctype named tuple

This is a convenience wrapper to ensure that a doctype has the needed information in the right format.

doc = 'This is a "sample"'
mk_doctype(doc)
doctype(src='\n47e19350\n', content='\nThis is a "sample"\n')

source

mk_doc


def mk_doc(
    index:int, # The document index
    content:str, # The document content
    src:Optional=None, # URL, filename, etc; defaults to `md5(content)` if not provided
    kwargs:VAR_KEYWORD
)->tuple:

Create an ft format tuple for a single doc in Anthropic’s recommended format

We can now generate XML for one document in the suggested format:

mk_doc(1, doc, title="test")
47e19350 This is a "sample"

source

docs_xml


def docs_xml(
    docs:list, # The content of each document
    srcs:Optional=None, # URLs, filenames, etc; each one defaults to `md5(content)` if not provided
    prefix:bool=False, # Include Anthropic's suggested prose intro?
    details:Optional=None, # Optional list of dicts with additional attrs for each doc
    title:str=None, # Optional title attr for Documents element
)->str:

Create an XML string containing docs in Anthropic’s recommended format

Putting it all together, we have our final XML format:

docs = [doc, 'And another one']
srcs = [None, 'doc.txt']
print(docs_xml(docs, srcs))
<documents><document index="1"><src>
47e19350
</src><document-content>
This is a "sample"
</document-content></document><document index="2"><src>
doc.txt
</src><document-content>
And another one
</document-content></document></documents>

Context creation

Now that we can generate Anthropic’s XML format, let’s make it easy for a few common cases.

File list to context

For generating XML context from files, we’ll just read them as text and use the file names as src.


source

read_file


def read_file(
    fname, max_size:NoneType=None, sigs_only:bool=False, nb:NoneType=None, out:bool=True, ids:bool=True,
    nums:bool=False
):

Read file content, converting notebooks to XML if needed


source

files2ctx


def files2ctx(
    fnames:list, # List of file names to add to context
    srcs:Optional=None, # Use these labels instead of `fnames`
    max_size:int=None, # Skip files larger than this (bytes)
    out:bool=True, # Include notebook cell outputs?
    ids:bool=True, # Include cell ids in notebooks?
    nums:bool=False, # Include line numbers in notebook cell source?
    sigs_only:bool=False, # For .py files, only include signatures and docstrings
    prefix:bool=False, # Include Anthropic's suggested prose intro?
    details:Optional=None, # Optional list of dicts with additional attrs for each doc
    title:str=None, # Optional title attr for Documents element
)->str: # XML for LM context

Convert files to XML context, handling notebooks

fnames = ['samples/sample_core.py', 'samples/sample_styles.css']
hl_md(files2ctx(fnames, max_size=120))
<documents><document index="1"><src>
samples/sample_core.py
</src><document-content>
[Skipped: sample_core.py exceeds 120 bytes]
</document-content></document><document index="2"><src>
samples/sample_styles.css
</src><document-content>
.cell { margin-bottom: 1rem; }
.cell > .sourceCode { margin-bottom: 0; }
.cell-output > pre { margin-bottom: 0; }
</document-content></document></documents>

Folder to context


source

folder2ctx


def folder2ctx(
    path:Union, # Folder to read
    prefix:bool=False, # Include Anthropic's suggested prose intro?
    out:bool=True, # Include notebook cell outputs?
    include_base:bool=True, # Include full path in src?
    title:str=None, # Optional title attr for Documents element
    max_size:int=100000, # Skip files larger than this (bytes)
    max_total:int=10000000, # Max total output size in bytes
    readme_first:bool=False, # Prioritize README files at start of context?
    files_only:bool=False, # Return dict of {filename: size} instead of context?
    sigs_only:bool=False, # Return signatures instead of full text for python files?
    ids:bool=True, # Include cell ids in notebooks?
    recursive:bool=True, # search subfolders
    symlinks:bool=True, # follow symlinks?
    file_glob:str=None, # Only include files matching glob
    file_re:str=None, # Only include files matching regex
    folder_re:str=None, # Only enter folders matching regex
    skip_file_glob:str=None, # Skip files matching glob
    skip_file_re:str=None, # Skip files matching regex
    skip_folder_re:str=None, # Skip folders matching regex,
    ret_folders:bool=False, # return folders, not just files
    sort:bool=True, # sort files by name within each folder
    types:str | list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
    exts:str | list=None, # list or comma-separated str of exts to include
)->Union:

Convert folder contents to XML context, handling notebooks

print(folder2ctx('samples', prefix=True, types='py'))
Here are some documents for you to reference for your task:

<documents><document index="1"><src>
samples/sample_core.py
</src><document-content>
import inspect
empty = inspect.Parameter.empty
models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
</document-content></document></documents>

source

sym2file


def sym2file(
    sym
):

Return md string with filepath and contents for a symbol’s source file

# from dialoghelper import *
# add_msg(sym2file(Path))

source

sym2folderctx


def sym2folderctx(
    sym,
    types:str | list='py', # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
    skip_file_re:str='^_mod', prefix:bool=False, # Include Anthropic's suggested prose intro?
    out:bool=True, # Include notebook cell outputs?
    include_base:bool=True, # Include full path in src?
    title:str=None, # Optional title attr for Documents element
    max_size:int=100000, # Skip files larger than this (bytes)
    max_total:int=10000000, # Max total output size in bytes
    readme_first:bool=False, # Prioritize README files at start of context?
    files_only:bool=False, # Return dict of {filename: size} instead of context?
    sigs_only:bool=False, # Return signatures instead of full text for python files?
    ids:bool=True, # Include cell ids in notebooks?
    recursive:bool=True, # search subfolders
    symlinks:bool=True, # follow symlinks?
    file_glob:str=None, # Only include files matching glob
    file_re:str=None, # Only include files matching regex
    folder_re:str=None, # Only enter folders matching regex
    skip_file_glob:str=None, # Skip files matching glob
    skip_folder_re:str=None, # Skip folders matching regex,
    ret_folders:bool=False, # return folders, not just files
    sort:bool=True, # sort files by name within each folder
    exts:str | list=None, # list or comma-separated str of exts to include
):

Return folder context for a symbol’s source file location

# add_msg(sym2folderctx(test_eq), msg_type='raw')

source

sym2pkgpath


def sym2pkgpath(
    sym
):

Get root package path for a symbol

sym2pkgpath(test_eq)
Path('/Users/jhoward/aai-ws/fastcore/fastcore')

source

sym2pkgctx


def sym2pkgctx(
    sym,
    types:str | list='py', # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
    skip_file_re:str='^_mod', prefix:bool=False, # Include Anthropic's suggested prose intro?
    out:bool=True, # Include notebook cell outputs?
    include_base:bool=True, # Include full path in src?
    title:str=None, # Optional title attr for Documents element
    max_size:int=100000, # Skip files larger than this (bytes)
    max_total:int=10000000, # Max total output size in bytes
    readme_first:bool=False, # Prioritize README files at start of context?
    files_only:bool=False, # Return dict of {filename: size} instead of context?
    sigs_only:bool=False, # Return signatures instead of full text for python files?
    ids:bool=True, # Include cell ids in notebooks?
    recursive:bool=True, # search subfolders
    symlinks:bool=True, # follow symlinks?
    file_glob:str=None, # Only include files matching glob
    file_re:str=None, # Only include files matching regex
    folder_re:str=None, # Only enter folders matching regex
    skip_file_glob:str=None, # Skip files matching glob
    skip_folder_re:str=None, # Skip folders matching regex,
    ret_folders:bool=False, # return folders, not just files
    sort:bool=True, # sort files by name within each folder
    exts:str | list=None, # list or comma-separated str of exts to include
):

Return repo context for a symbol’s root package

# add_msg(sym2pkgctx(tqdm), msg_type='raw')
Tip

After you install toolslm, folder2ctx becomes available from the command line.

!folder2ctx -h
usage: folder2ctx [-h] [--path PATH] [--no-out] [--prefix] [--no-include_base]
                  [--title TITLE] [--max_size MAX_SIZE] [--max_total MAX_TOTAL]
                  [--readme_first] [--files_only] [--sigs_only] [--no-ids]
                  [--no-recursive] [--no-symlinks] [--file_glob FILE_GLOB]
                  [--file_re FILE_RE] [--folder_re FOLDER_RE]
                  [--skip_file_glob SKIP_FILE_GLOB]
                  [--skip_file_re SKIP_FILE_RE]
                  [--skip_folder_re SKIP_FOLDER_RE] [--ret_folders] [--no-sort]
                  [--types TYPES] [--exts EXTS]

CLI to convert folder contents to XML context, handling notebooks

options:
  -h, --help                       show this help message and exit
  --path PATH                      Folder name containing files to add to
                                   context (default: .)
  --no-out                         Include notebook cell outputs? (default:
                                   True)
  --prefix                         Include Anthropic's suggested prose intro?
                                   (default: False)
  --no-include_base                Include full path in src? (default: True)
  --title TITLE                    Optional title attr for Documents element
  --max_size MAX_SIZE              Skip files larger than this (bytes) (default:
                                   100000)
  --max_total MAX_TOTAL            Max total output size in bytes (default:
                                   10000000)
  --readme_first                   Prioritize README files at start of context?
                                   (default: False)
  --files_only                     Return dict of {filename: size} instead of
                                   context? (default: False)
  --sigs_only                      Return signatures instead of full text for
                                   python files? (default: False)
  --no-ids                         Include cell ids in notebooks? (default:
                                   True)
  --no-recursive                   search subfolders (default: True)
  --no-symlinks                    follow symlinks? (default: True)
  --file_glob FILE_GLOB            Only include files matching glob
  --file_re FILE_RE                Only include files matching regex
  --folder_re FOLDER_RE            Only enter folders matching regex
  --skip_file_glob SKIP_FILE_GLOB  Skip files matching glob
  --skip_file_re SKIP_FILE_RE      Skip files matching regex
  --skip_folder_re SKIP_FOLDER_RE  Skip folders matching regex,
  --ret_folders                    return folders, not just files (default:
                                   False)
  --no-sort                        sort files by name within each folder
                                   (default: True)
  --types TYPES                    list or comma-separated str of ext types
                                   from: py, js, java, c, cpp, rb, r, ex, sh,
                                   web, doc, cfg
  --exts EXTS                      list or comma-separated str of exts to
                                   include

source

parse_gh_url


def parse_gh_url(
    url
):

Parse GitHub URL into (owner, repo, type, ref, path) or None


source

repo2ctx


def repo2ctx(
    owner:str, # GitHub repo owner or "owner/repo" or a full github URL
    repo:str=None, # GitHub repo name (leave empty if using "owner/repo" or URL format for owner param)
    ref:str=None, # Git ref (branch/tag/sha) (taken from URL if not provided); defaults to repo's default branch
    folder:str=None, # Only include files under this path (taken from URL if not provided)
    show_filters:bool=True, # Include filter info in title?
    token:str=None, # GitHub token (uses GITHUB_TOKEN env var if None)
    prefix:bool=False, # Include Anthropic's suggested prose intro?
    out:bool=True, # Include notebook cell outputs?
    include_base:bool=True, # Include full path in src?
    title:str=None, # Optional title attr for Documents element
    max_size:int=100000, # Skip files larger than this (bytes)
    max_total:int=10000000, # Max total output size in bytes
    readme_first:bool=False, # Prioritize README files at start of context?
    files_only:bool=False, # Return dict of {filename: size} instead of context?
    sigs_only:bool=False, # Return signatures instead of full text for python files?
    ids:bool=True, # Include cell ids in notebooks?
    recursive:bool=True, # search subfolders
    symlinks:bool=True, # follow symlinks?
    file_glob:str=None, # Only include files matching glob
    file_re:str=None, # Only include files matching regex
    folder_re:str=None, # Only enter folders matching regex
    skip_file_glob:str=None, # Skip files matching glob
    skip_file_re:str=None, # Skip files matching regex
    skip_folder_re:str=None, # Skip folders matching regex,
    ret_folders:bool=False, # return folders, not just files
    sort:bool=True, # sort files by name within each folder
    types:str | list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
    exts:str | list=None, # list or comma-separated str of exts to include
)->Union: # XML for LM context, or dict of file sizes

Convert GitHub repo to XML context without cloning

print(repo2ctx('answerdotai/toolslm', exts=('ipynb','py'), skip_file_re='^_', out=False, max_total=500))
<documents title="GitHub repository contents from answerdotai/toolslm/main (filters applied -- exts: ipynb, py | skip_file_re: ^_ | max_total: 500)"><document index="1"><src>
00_xml.ipynb
</src><document-content>
<notebook><code id="efe78920">#|default_exp xml</code><md id="87ea05a3"># xml source</md><code id="033c76fd">#| export
import hashlib, inspect, xml.etree.ElementTree as ET, ast
from collections import namedtuple
from ghapi.all

[TRUNCATED: output size 104276 exceeded max size 500 bytes]
print(repo2ctx('answerdotai/toolslm', types='py', skip_file_re='^_', out=False, files_only=True))
{'00_xml.ipynb': 39144, '01_funccall.ipynb': 66233, '02_shell.ipynb': 6295, '03_download.ipynb': 12178, '04_md_hier.ipynb': 8091, 'index.ipynb': 3089, 'setup.py': 2596, 'samples/sample_core.py': 134, 'toolslm/download.py': 4481, 'toolslm/funccall.py': 11378, 'toolslm/md_hier.py': 11010, 'toolslm/shell.py': 1566, 'toolslm/xml.py': 12837}
print(repo2ctx('https://github.com/AnswerDotAI/toolslm/tree/main/samples'))
<documents title="GitHub repository contents from AnswerDotAI/toolslm/main/samples"><document index="1"><src>
sample_core.py
</src><document-content>
import inspect
empty = inspect.Parameter.empty
models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
</document-content></document><document index="2"><src>
sample_styles.css
</src><document-content>
.cell { margin-bottom: 1rem; }
.cell > .sourceCode { margin-bottom: 0; }
.cell-output > pre { margin-bottom: 0; }
</document-content></document></documents>