#!/usr/bin/env python3
from datetime import date
from io import StringIO
from lxml import etree
import markdown
from markdown.extensions import Extension
import re
import sys
import yaml
class EscapeHtml(Extension):
    """Markdown extension that switches off raw-HTML handling.

    Prevents the markdown parser from trying to help by interpreting
    things in angle brackets as HTML tags.
    """

    def extendMarkdown(self, md):
        """Deregister the block-level and inline HTML processors from *md*."""
        html_handlers = (
            (md.preprocessors, 'html_block'),
            (md.inlinePatterns, 'html'),
        )
        for registry, handler_name in html_handlers:
            registry.deregister(handler_name)
class RoffWalker:
    """Walk an HTML element tree and emit roff (man page) source.

    The tree only needs to provide the element API the walker uses:
    ``tag``, ``text``, ``tail``, ``getnext()``, ``itertext()``, ``len()``
    and indexing (lxml elements do).  Output is accumulated in an
    in-memory buffer and written to ``output`` in one go by :meth:`walk`.
    """

    def __init__(self, tree, output=sys.stdout):
        """Store the root element and the writable target.

        Args:
            tree: root element of the HTML tree to convert.
            output: object with a ``write(str)`` method; defaults to the
                ``sys.stdout`` that was current when the class was defined.
        """
        self.tree = tree
        self.target = output
        self.f = StringIO()

    def walk(self):
        """Convert the whole tree and write the result to the target."""
        self._walk(self.tree, parent_tag=None)
        # We don't want to start lines with \. because that can confuse man
        # For lines that start with \., we need to prefix them with \& so it
        # knows not to treat that line as a directive
        data = re.sub(r'^\\\.', r'\&.', self.f.getvalue(), flags=re.MULTILINE)
        self.target.write(data)

    def _ul_is_special(self, root):
        """Return True for a list holding a single <li> whose text ends in ':'.

        Such a list is rendered as a tagged paragraph (.TP) by ``_walk``
        instead of as a bullet list.
        """
        if len(root) != 1:
            return False
        child = root[0]
        if child.tag != 'li':
            return False
        msg = ''.join(child.itertext()).strip()
        return msg.endswith(':')

    def _walk_child(self, root):
        """Walk the children of *root*, if it has any.

        Only the first child is handed to ``_walk``; ``_walk`` itself then
        follows the sibling chain via ``getnext()``.
        """
        if len(root) > 0:
            self._walk(root[0], parent_tag=root.tag)

    def _write_element(self, root, ensure_newline=True):
        """Write an element's text, then its children, then its tail."""
        if root.text is not None:
            self.__write_raw(self._sanitize(root.text))
        self._walk_child(root)
        self._write_tail(root, ensure_newline=ensure_newline)

    def _write_tail(self, root, ensure_newline=False, inline=False):
        """Write the text following *root* (its ``tail``).

        Args:
            root: element whose tail is written.
            ensure_newline: when true, a newline is written afterwards.
            inline: when true, a tail consisting of a lone newline is still
                written (as a space, after sanitizing); otherwise such a
                tail is skipped.
        """
        tail = root.tail
        if tail is not None and (inline or tail != '\n'):
            # _sanitize turns every newline into a space, so the sanitized
            # tail can never end in '\n'; ensure_newline therefore never
            # needs to be suppressed here.
            self.__write_raw(self._sanitize(tail))
        if ensure_newline:
            self.__write_raw('\n')

    def _write_inline(self, root, font, sanitize):
        """Write *root*'s text wrapped in a roff font escape, then its tail.

        Args:
            root: inline element (strong / em / code).
            font: roff font letter placed after ``\\f`` ('B' or 'I').
            sanitize: callable used to escape the element's text.
        """
        if root.text is not None:
            self.__write_raw('\\f{}{}\\fR'.format(font, sanitize(root.text)))
        self._write_tail(root, inline=True)

    def _walk(self, root, parent_tag=None):
        """Render *root* and every following sibling.

        Args:
            root: first element of the sibling chain to render.
            parent_tag: tag of the enclosing element, or None at the top.
        """
        last_tag = None
        while root is not None:
            tag = root.tag
            if tag == 'h1':
                self.__write_cmd('.TH "JQ" "1" "{}" "" ""'.format(
                    date.today().strftime('%B %Y')))
                self.__write_cmd('.SH "NAME"')
                # TODO: properly parse this
                self.__write_raw(r'\fBjq\fR \- Command\-line JSON processor' +
                                 "\n")
            elif tag == 'h2':
                self.__write_cmd('.SH "{}"'.format(''.join(
                    root.itertext()).strip()))
            elif tag == 'h3':
                text = ''.join(root.itertext()).strip()
                self.__write_cmd('.SS "{}"'.format(self._h3_sanitize(text)))
            elif tag == 'p':
                # No paragraph break directly after a heading or inside a
                # list item.
                if last_tag not in ['h2', 'h3'] and parent_tag not in ['li']:
                    self.__write_cmd('.P')
                self._write_element(root, ensure_newline=(parent_tag != 'li'))
            elif tag == 'ul':
                if self._ul_is_special(root):
                    li = root[0]
                    self.__write_cmd('.TP')
                    self._write_element(li)
                    # Consume the paragraphs that follow the list as the
                    # indented body of the tagged paragraph.
                    sibling = root.getnext()
                    while sibling is not None and sibling.tag == 'p':
                        after = sibling.getnext()
                        if after is not None and after.tag == 'pre':
                            # we don't want to .IP these, because it'll look
                            # funny with the code indent
                            break
                        self.__write_cmd('.IP')
                        self._write_element(sibling)
                        # Advance root so the consumed <p> is not rendered
                        # again by the outer loop.
                        root = sibling
                        sibling = root.getnext()
                else:
                    self._walk_child(root)
                    self._write_tail(root)
                    # A pre tag after the end of a list doesn't want two of
                    # the indentation commands
                    following = root.getnext()
                    if following is None or following.tag != 'pre':
                        self.__write_cmd('.IP "" 0')
            elif tag == 'li':
                self.__write_cmd(r'.IP "\(bu" 4')
                if root.text is not None and root.text.strip() != '':
                    self.__write_raw(self._sanitize(root.text))
                self._walk_child(root)
                self._write_tail(root, ensure_newline=True)
            elif tag == 'strong':
                self._write_inline(root, 'B', self._sanitize)
            elif tag == 'em':
                self._write_inline(root, 'I', self._sanitize)
            elif tag == 'code':
                self._write_inline(root, 'B', self._code_sanitize)
            elif tag == 'pre':
                self.__write_cmd('.IP "" 4')
                self.__write_cmd('.nf\n')  # extra newline for spacing reasons
                # Merge a run of adjacent <pre> elements into one no-fill
                # region, separated by blank lines.
                block = root
                first = True
                while block is not None and block.tag == 'pre':
                    if not first:
                        self.__write_raw('\n')
                    text = ''.join(block.itertext(with_tail=False))
                    self.__write_raw(self._pre_sanitize(text))
                    first = False
                    root = block
                    block = block.getnext()
                self.__write_cmd('.fi')
                self.__write_cmd('.IP "" 0')
            else:
                # Unknown or wrapper element: just descend.
                self._walk_child(root)
            # root may have been advanced by the 'ul' / 'pre' branches, so
            # read the tag again rather than reusing the cached one.
            last_tag = root.tag
            root = root.getnext()

    def _base_sanitize(self, text):
        """Escape backslash, '.', "'" and '-' for roff.

        The backslash is handled first so the backslashes introduced by the
        later replacements are not escaped again.
        """
        text = text.replace('\\', '\\e')
        text = text.replace('.', '\\.')
        text = text.replace("'", "\\'")
        text = text.replace('-', '\\-')
        return text

    def _pre_sanitize(self, text):
        """Escape preformatted text; whitespace is left untouched."""
        return self._base_sanitize(text)

    def _code_sanitize(self, text):
        """Escape inline code and turn each whitespace character into a space."""
        text = self._base_sanitize(text)
        text = re.sub(r'\s', ' ', text)
        return text

    def _h3_sanitize(self, text):
        """Escape a subsection title and join its lines with single spaces."""
        text = self._base_sanitize(text)
        text = re.sub(' \n|\n ', ' ', text)
        text = re.sub('\n', ' ', text)
        return text

    def _sanitize(self, text):
        """Escape running text.

        Besides the base escapes, ``<name>`` becomes italic ``name``, runs
        of spaces collapse to one space and newlines become spaces.
        """
        text = self._base_sanitize(text)
        text = re.sub(r'<([^>]+)>', r'\\fI\1\\fR', text)
        text = re.sub(r' +', ' ', text)
        text = re.sub('\n', ' ', text)
        return text

    def __write_cmd(self, dat):
        """Append a line holding a lone '.', then *dat* on its own line."""
        print('.', dat, sep='\n', file=self.f)

    def __write_raw(self, dat):
        """Append *dat* to the buffer verbatim, without a trailing newline."""
        print(dat, sep='', end='', file=self.f)
def load_yml_file(fn):
    """Parse the YAML file at path *fn* and return the loaded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the locale's default encoding (e.g. a C/POSIX locale would otherwise
    fail on non-ASCII content).  ``yaml.safe_load`` is used, so only plain
    YAML types are constructed.
    """
    with open(fn, encoding='utf-8') as f:
        return yaml.safe_load(f)
def dedent_body(body):
    """Remove the leading space from lines that start with a single space.

    A line loses its first character only when it begins with a space that
    is immediately followed by a non-whitespace character; every other
    line is returned unchanged.
    """
    # In MULTILINE mode '^' anchors at the start of every '\n'-separated
    # line, so one pass over the whole body handles each line in turn.
    return re.sub(r'^ (\S)', r'\1', body, flags=re.MULTILINE)
def convert_manual_to_markdown(manual_path="content/manual/manual.yml"):
    """Flatten the YAML manual into one markdown document and return it.

    Args:
        manual_path: path of the manual YAML file.  Defaults to the
            location the script has always read, so existing callers are
            unaffected.

    Returns:
        The markdown text: the manpage intro and top-level body, then one
        ``##`` heading per section (title upper-cased) and one ``###``
        heading per entry, each followed by its dedented body.  An entry's
        examples are emitted as a single ``~~~~`` fenced block.  The
        manpage epilogue comes last.
    """
    manual = load_yml_file(manual_path)
    out = StringIO()
    out.write(manual.get('manpage_intro', '\n'))
    out.write(dedent_body(manual.get('body', '\n')))

    for section in manual.get('sections', []):
        out.write('## {}\n'.format(section.get('title', '').upper()))
        out.write(dedent_body(section.get('body', '\n')))
        out.write('\n')

        for entry in section.get('entries', []):
            out.write('### {}\n'.format(entry.get('title', '')))
            out.write(dedent_body(entry.get('body', '\n')))
            out.write('\n')

            examples = entry.get('examples')
            if examples is not None:
                out.write("~~~~\n")
                for index, example in enumerate(examples):
                    # Blank line between consecutive examples, none before
                    # the first.
                    if index:
                        out.write('\n')
                    out.write("jq '{}'\n".format(example.get('program', '')))
                    out.write(" {}\n".format(example.get('input', '')))
                    results = [str(x) for x in example.get('output', [])]
                    out.write("=> {}\n".format(', '.join(results)))
                out.write("~~~~\n")
            out.write('\n')

    out.write(manual.get('manpage_epilogue', ''))
    return out.getvalue()
# Step 1: flatten manual.yml into our special markdown format.
markdown_data = convert_manual_to_markdown()

# Step 2: render the markdown as HTML.  EscapeHtml keeps angle-bracketed
# text from being treated as tags; 'fenced_code' handles the ~~~~ blocks.
md_extensions = [EscapeHtml(), 'fenced_code']
html_data = markdown.markdown(markdown_data, extensions=md_extensions)

# Step 3: parse the HTML into an element tree so it can be walked.
html_parser = etree.HTMLParser()
tr = etree.HTML(html_data, html_parser)

# Step 4: walk the tree and write the ROFF output (to stdout by default).
RoffWalker(tr).walk()