xml source

Setup


source

json_to_xml

 json_to_xml (d:dict, rnm:str)

Convert d to XML.

Type Details
d dict JSON dictionary to convert
rnm str Root name
Returns str
Exported source
def json_to_xml(d:dict, # JSON dictionary to convert
                rnm:str # Root name
               )->str:
    "Render the JSON-like structure `d` as an indented XML string under a root element named `rnm`."
    def _fill(node, value):
        # Anything that is neither a dict nor a list is a leaf: store its `str()` as the node text
        if not isinstance(value, (dict, list)):
            node.text = str(value)
            return
        # Dict entries become children named after their key; list entries become `item` children
        if isinstance(value, dict): pairs = value.items()
        else: pairs = (('item', o) for o in value)
        for tag, child in pairs: _fill(ET.SubElement(node, tag), child)
    top = ET.Element(rnm)
    _fill(top, d)
    # Pretty-print in place before serialising
    ET.indent(top)
    return ET.tostring(top, encoding='unicode')

JSON doesn’t map as nicely to XML as the data structure used in fastcore.xml, but for simple XML trees it can be convenient – for example:

a = dict(surname='Howard', firstnames=['Jeremy','Peter'],
         address=dict(state='Queensland',country='Australia'))
hl_md(json_to_xml(a, 'person'))
<person>
  <surname>Howard</surname>
  <firstnames>
    <item>Jeremy</item>
    <item>Peter</item>
  </firstnames>
  <address>
    <state>Queensland</state>
    <country>Australia</country>
  </address>
</person>

Including documents

According to Anthropic, “it’s essential to structure your prompts in a way that clearly separates the input data from the instructions”. They recommend using something like the following:

Here are some documents for you to reference for your task:
    
<documents>
<document index="1">
<source>
(URL, file name, hash, etc)
</source>
<document_content>
(the text content)
</document_content>
</document>
</documents>

We will create some small helper functions to make it easier to generate context in this format, although we’ll use <src> instead of <source> to avoid conflict with that HTML tag. Although this format is based on Anthropic’s recommendation, it’s likely to work well with other models too.

Exported source
# Lightweight record pairing a document's source label (`src`) with its text (`content`)
doctype = namedtuple('doctype', ('src', 'content'))

We’ll use doctype to store our pairs.

Exported source
def _add_nls(s):
    "Add newlines to start and end of `s` if missing"
    if not s: return s
    if s[ 0]!='\n': s = '\n'+s
    if s[-1]!='\n': s = s+'\n'
    return s

Since Anthropic’s example shows newlines before and after each tag, we’ll do the same.

to_xml(Src('a'))
'<src>a</src>'
to_xml(Document('a'))
'<document>a</document>'
to_xml(Documents('a'))
'<documents>a</documents>'

source

mk_doctype

 mk_doctype (content:str, src:Optional[str]=None)

Create a doctype named tuple

Type Default Details
content str The document content
src Optional None URL, filename, etc; defaults to md5(content) if not provided
Returns namedtuple
Exported source
def mk_doctype(content:str,  # The document content
           src:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided
          ) -> namedtuple:
    "Build a `doctype` whose `src` and `content` are stripped and then wrapped in newlines"
    if src is None:
        # No source given: label the doc with the first 8 hex chars of the MD5 of its content
        digest = hashlib.md5(content.encode()).hexdigest()
        src = digest[:8]
    # `src` may be a non-str (e.g. a path object), so it is stringified before stripping
    src_txt = _add_nls(str(src).strip())
    body_txt = _add_nls(content.strip())
    return doctype(src_txt, body_txt)

This is a convenience wrapper to ensure that a doctype has the needed information in the right format.

doc = 'This is a "sample"'
mk_doctype(doc)
doctype(src='\n47e19350\n', content='\nThis is a "sample"\n')

source

mk_doc

 mk_doc (index:int, content:str, src:Optional[str]=None, **kwargs)

Create an ft format tuple for a single doc in Anthropic’s recommended format

Type Default Details
index int The document index
content str The document content
src Optional None URL, filename, etc; defaults to md5(content) if not provided
kwargs
Returns tuple
Exported source
def mk_doc(index:int,  # The document index
           content:str,  # The document content
           src:Optional[str]=None, # URL, filename, etc; defaults to `md5(content)` if not provided
           **kwargs
          ) -> tuple:
    "Build the `Document` element for one doc: a `Src` child, a `Document_content` child, and `index` plus any `kwargs` as attrs"
    # Normalise the pair first (default src, stripping, surrounding newlines)
    pair = mk_doctype(content, src)
    # NOTE(review): `NotStr` presumably marks the text as already rendered so it is emitted verbatim — confirm against its definition
    body_tag = Document_content(NotStr(pair.content))
    src_tag = Src(NotStr(pair.src))
    return Document(src_tag, body_tag, index=index, **kwargs)

We can now generate XML for one document in the suggested format:

mk_doc(1, doc, title="test")
<document index="1" title="test"><src>
47e19350
</src><document-content>
This is a "sample"
</document-content></document>

source

docs_xml

 docs_xml (docs:list[str], srcs:Optional[list]=None, prefix:bool=True,
           details:Optional[list]=None)

Create an XML string containing docs in Anthropic’s recommended format

Type Default Details
docs list The content of each document
srcs Optional None URLs, filenames, etc; each one defaults to md5(content) if not provided
prefix bool True Include Anthropic’s suggested prose intro?
details Optional None Optional list of dicts with additional attrs for each doc
Returns str
Exported source
def docs_xml(docs:list[str],  # The content of each document
             srcs:Optional[list]=None,  # URLs, filenames, etc; each one defaults to `md5(content)` if not provided
             prefix:bool=True, # Include Anthropic's suggested prose intro?
             details:Optional[list]=None # Optional list of dicts with additional attrs for each doc
            )->str:
    "Render `docs` as a single `Documents` XML string, optionally preceded by Anthropic's suggested intro sentence"
    intro = 'Here are some documents for you to reference for your task:\n\n' if prefix else ''
    # Missing `srcs`/`details` are filled with one placeholder per doc
    srcs = [None]*len(docs) if srcs is None else srcs
    details = [{}]*len(docs) if details is None else details
    # `zip` stops at the shortest of the three inputs; docs are numbered from 1
    rows = zip(docs, srcs, details)
    tagged = (mk_doc(idx, body, src, **attrs) for idx,(body,src,attrs) in enumerate(rows, start=1))
    return intro + to_xml(Documents(tagged))

Putting it all together, we have our final XML format:

docs = [doc, 'And another one']
srcs = [None, 'doc.txt']
print(docs_xml(docs, srcs))
Here are some documents for you to reference for your task:

<documents><document index="1"><src>
47e19350
</src><document-content>
This is a "sample"
</document-content></document><document index="2"><src>
doc.txt
</src><document-content>
And another one
</document-content></document></documents>

Context creation

Now that we can generate Anthropic’s XML format, let’s make it easy for a few common cases.

File list to context

For generating XML context from files, we’ll just read them as text and use the file names as src.


source

files2ctx

 files2ctx (fnames:list[typing.Union[str,pathlib.Path]], prefix:bool=True)
Type Default Details
fnames list List of file names to add to context
prefix bool True Include Anthropic’s suggested prose intro?
Returns str XML for LM context
Exported source
def files2ctx(
    fnames:list[Union[str,Path]], # List of file names to add to context
    prefix:bool=True # Include Anthropic's suggested prose intro?
)->str: # XML for LM context
    "Read every file in `fnames` as text and pass the texts to `docs_xml`, with the paths as sources"
    # Convert all names to `Path` first, then read them all
    paths = list(map(Path, fnames))
    texts = [p.read_text() for p in paths]
    return docs_xml(texts, paths, prefix=prefix)
fnames = ['samples/sample_core.py', 'samples/sample_styles.css']
hl_md(files2ctx(fnames))
Here are some documents for you to reference for your task:

<documents><document index="1"><src>
samples/sample_core.py
</src><document-content>
import inspect
empty = inspect.Parameter.empty
models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
</document-content></document><document index="2"><src>
samples/sample_styles.css
</src><document-content>
.cell { margin-bottom: 1rem; }
.cell > .sourceCode { margin-bottom: 0; }
.cell-output > pre { margin-bottom: 0; }
</document-content></document></documents>

Folder to context


source

folder2ctx

 folder2ctx (folder:Union[str,pathlib.Path], prefix:bool=True,
             recursive:bool=True, symlinks:bool=True, file_glob:str=None,
             file_re:str=None, folder_re:str=None,
             skip_file_glob:str=None, skip_file_re:str=None,
             skip_folder_re:str=None, func:callable=<function join>,
             ret_folders:bool=False)
Type Default Details
folder Union Folder name containing files to add to context
prefix bool True Include Anthropic’s suggested prose intro?
recursive bool True search subfolders
symlinks bool True follow symlinks?
file_glob str None Only include files matching glob
file_re str None Only include files matching regex
folder_re str None Only enter folders matching regex
skip_file_glob str None Skip files matching glob
skip_file_re str None Skip files matching regex
skip_folder_re str None Skip folders matching regex,
func callable join function to apply to each matched file
ret_folders bool False return folders, not just files
Returns str XML for Claude context
Exported source
@delegates(globtastic)
def folder2ctx(
    folder:Union[str,Path], # Folder name containing files to add to context
    prefix:bool=True, # Include Anthropic's suggested prose intro?
    **kwargs # Passed to `globtastic`
)->str: # XML for Claude context
    "Pass the file names returned by `globtastic(folder, **kwargs)` to `files2ctx`"
    return files2ctx(globtastic(folder, **kwargs), prefix=prefix)
print(folder2ctx('samples', prefix=False, file_glob='*.py'))
<documents><document index="1"><src>
samples/sample_core.py
</src><document-content>
import inspect
empty = inspect.Parameter.empty
models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
</document-content></document></documents>
Tip

After you install toolslm, folder2ctx becomes available from the command line. You can see how to use it with the following command:

folder2ctx -h