docs/build_manpage.py - third_party/github.com/jqlang/jq - Git at Google

 #!/usr/bin/env python3
 from datetime import date
 from io import StringIO
 from lxml import etree
 import markdown
 from markdown.extensions import Extension
 import re
 import sys
 import yaml


 # Prevent our markdown parser from trying to help by interpreting things in angle brackets as HTML tags.
 class EscapeHtml(Extension):
     """Markdown extension that disables raw-HTML handling.

     With the HTML block preprocessor and the inline HTML pattern removed,
     text in angle brackets is not interpreted as an HTML tag.
     """

     def extendMarkdown(self, md):
         """Deregister the HTML-related processors from ``md``."""
         html_processors = (
             (md.preprocessors, 'html_block'),
             (md.inlinePatterns, 'html'),
         )
         for registry, name in html_processors:
             registry.deregister(name)


 class RoffWalker:
     """Walk a parsed HTML element tree and emit roff (man page) markup.

     Nodes must offer the lxml element interface used below: ``tag``,
     ``text``, ``tail``, ``getnext()``, ``itertext()``, ``len()`` and
     indexing. Output is collected in an in-memory buffer and only written
     to ``output`` when walk() finishes.
     """

     def __init__(self, tree, output=sys.stdout):
         """Store the tree to convert and the stream that receives the roff."""
         self.tree = tree
         self.target = output
         self.f = StringIO()  # roff text accumulated while walking

     def walk(self):
         """Convert the whole tree and write the result to the target."""
         self._walk(self.tree, parent_tag=None)
         # We don't want to start lines with \. because that can confuse man.
         # Such lines are prefixed with \& so man knows not to treat them as
         # a directive.
         data = re.sub(r'^\\\.', r'\&.', self.f.getvalue(), flags=re.MULTILINE)
         self.target.write(data)

     def _ul_is_special(self, root):
         """Return True if root has a single <li> child whose text ends in ':'."""
         if len(root) != 1 or root[0].tag != 'li':
             return False
         return ''.join(root[0].itertext()).strip().endswith(':')

     def _walk_child(self, root):
         """Walk root's children, if it has any."""
         if len(root) > 0:
             self._walk(root[0], parent_tag=root.tag)

     def _write_element(self, root, ensure_newline=True):
         """Write root's own text, then its children, then its tail."""
         if root.text is not None:
             self.__write_raw(self._sanitize(root.text))
         self._walk_child(root)
         self._write_tail(root, ensure_newline=ensure_newline)

     def _write_tail(self, root, ensure_newline=False, inline=False):
         """Write the text that follows root's closing tag.

         A tail consisting of a single newline is skipped unless ``inline``
         is set. When ``ensure_newline`` is set, a newline is written
         afterwards.
         """
         tail = root.tail
         if tail is not None and (inline or tail != '\n'):
             # _sanitize() replaces every newline with a space, so the text
             # written here never terminates the line on its own.
             self.__write_raw(self._sanitize(tail))
         if ensure_newline:
             self.__write_raw('\n')

     def _write_inline(self, root, font, sanitize):
         """Write root's text wrapped in a roff font change, then its tail.

         ``font`` is the roff font letter (for example 'B' or 'I') and
         ``sanitize`` is the escaping function applied to the text. Child
         elements of root are not visited.
         """
         if root.text is not None:
             self.__write_raw('\\f{}{}\\fR'.format(font, sanitize(root.text)))
         self._write_tail(root, inline=True)

     def _walk(self, root, parent_tag=None):
         """Emit roff for ``root`` and each of its following siblings.

         ``parent_tag`` is the tag of the enclosing element; paragraphs
         inside an <li> are written without a paragraph break.
         """
         last_tag = None
         while root is not None:
             tag = root.tag
             if tag == 'h1':
                 self.__write_cmd('.TH "JQ" "1" "{}" "" ""'.format(
                     date.today().strftime('%B %Y')))
                 self.__write_cmd('.SH "NAME"')
                 # TODO: properly parse this
                 self.__write_raw(r'\fBjq\fR \- Command\-line JSON processor' +
                                  "\n")

             elif tag == 'h2':
                 title = ''.join(root.itertext()).strip()
                 self.__write_cmd('.SH "{}"'.format(title))

             elif tag == 'h3':
                 title = ''.join(root.itertext()).strip()
                 self.__write_cmd('.SS "{}"'.format(self._h3_sanitize(title)))

             elif tag == 'p':
                 if last_tag not in ('h2', 'h3') and parent_tag != 'li':
                     self.__write_cmd('.P')
                 self._write_element(root, ensure_newline=(parent_tag != 'li'))

             elif tag == 'ul':
                 if self._ul_is_special(root):
                     self.__write_cmd('.TP')
                     self._write_element(root[0])
                     # Consume the paragraphs that follow the list as indented
                     # paragraphs.
                     sibling = root.getnext()
                     while sibling is not None and sibling.tag == 'p':
                         after = sibling.getnext()
                         if after is not None and after.tag == 'pre':
                             # we don't want to .IP these, because it'll look
                             # funny with the code indent
                             break
                         self.__write_cmd('.IP')
                         self._write_element(sibling)
                         root = sibling
                         sibling = after
                 else:
                     self._walk_child(root)
                     self._write_tail(root)
                     # A pre tag after the end of a list doesn't want two of
                     # the indentation commands
                     following = root.getnext()
                     if following is None or following.tag != 'pre':
                         self.__write_cmd('.IP "" 0')

             elif tag == 'li':
                 self.__write_cmd(r'.IP "\(bu" 4')
                 if root.text is not None and root.text.strip() != '':
                     self.__write_raw(self._sanitize(root.text))
                 self._walk_child(root)
                 self._write_tail(root, ensure_newline=True)

             elif tag == 'strong':
                 self._write_inline(root, 'B', self._sanitize)

             elif tag == 'em':
                 self._write_inline(root, 'I', self._sanitize)

             elif tag == 'code':
                 self._write_inline(root, 'B', self._code_sanitize)

             elif tag == 'pre':
                 self.__write_cmd('.IP "" 4')
                 self.__write_cmd('.nf\n')  # extra newline for spacing reasons
                 # Merge a run of adjacent <pre> elements into one no-fill
                 # block, separated by blank lines.
                 block = root
                 first = True
                 while block is not None and block.tag == 'pre':
                     if not first:
                         self.__write_raw('\n')
                     text = ''.join(block.itertext(with_tail=False))
                     self.__write_raw(self._pre_sanitize(text))
                     first = False
                     root = block
                     block = block.getnext()
                 self.__write_cmd('.fi')
                 self.__write_cmd('.IP "" 0')

             else:
                 self._walk_child(root)

             # root may have advanced past consumed siblings above, so read
             # the tag from root again rather than reusing ``tag``.
             last_tag = root.tag
             root = root.getnext()

     def _base_sanitize(self, text):
         """Escape backslash, '.', apostrophe and '-' for roff.

         Backslashes are handled first so that the backslashes added by the
         later replacements are not escaped again.
         """
         return (text.replace('\\', '\\e')
                 .replace('.', '\\.')
                 .replace("'", "\\'")
                 .replace('-', '\\-'))

     def _pre_sanitize(self, text):
         """Escape preformatted text; whitespace is kept as is."""
         return self._base_sanitize(text)

     def _code_sanitize(self, text):
         """Escape inline code and turn each whitespace character into a space."""
         return re.sub(r'\s', ' ', self._base_sanitize(text))

     def _h3_sanitize(self, text):
         """Escape a heading and join its lines with single spaces."""
         text = self._base_sanitize(text)
         # A newline next to a space collapses into one space together with
         # that space; any remaining newline becomes a space.
         text = re.sub(' \n|\n ', ' ', text)
         return text.replace('\n', ' ')

     def _sanitize(self, text):
         """Escape running text for roff.

         Text in angle brackets is set in italics, runs of spaces are
         collapsed to one space and newlines become spaces.
         """
         text = self._base_sanitize(text)
         text = re.sub(r'<([^>]+)>', r'\\fI\1\\fR', text)
         text = re.sub(r' +', ' ', text)
         return text.replace('\n', ' ')

     def __write_cmd(self, dat):
         """Append a roff request on its own line, preceded by a lone '.' line."""
         print('.', dat, sep='\n', file=self.f)

     def __write_raw(self, dat):
         """Append ``dat`` to the buffer without a trailing newline."""
         print(dat, sep='', end='', file=self.f)


 def load_yml_file(fn):
     """Parse the YAML file at path ``fn`` and return the loaded data.

     The file is decoded as UTF-8 explicitly so the result does not depend
     on the locale's default encoding.
     """
     with open(fn, encoding='utf-8') as stream:
         return yaml.safe_load(stream)


 def dedent_body(body):
     """Remove a two-space indent from the lines of ``body``.

     Only lines that begin with exactly two spaces followed by a
     non-whitespace character are changed; all other lines, including more
     deeply indented ones, are returned untouched.
     """
     two_space_indent = re.compile(r'^  (\S)')
     return '\n'.join(
         two_space_indent.sub(r'\1', line) for line in body.split('\n'))


 def convert_manual_to_markdown():
     """Render content/manual/manual.yml as one markdown document.

     The output is the manual's 'manpage_intro' and 'body', then every
     section as a level-2 heading (title upper-cased) with its body, every
     entry of the section as a level-3 heading with its body and, when the
     entry has examples, a fenced block listing each example's program,
     input and outputs. The 'manpage_epilogue' is appended last.
     """
     manual = load_yml_file("content/manual/manual.yml")
     out = StringIO()
     out.write(manual.get('manpage_intro', '\n'))
     out.write(dedent_body(manual.get('body', '\n')))
     for section in manual.get('sections', []):
         heading = section.get('title', '').upper()
         out.write('## {}\n'.format(heading))
         out.write(dedent_body(section.get('body', '\n')))
         out.write('\n')
         for entry in section.get('entries', []):
             out.write('### {}\n'.format(entry.get('title', '')))
             out.write(dedent_body(entry.get('body', '\n')))
             out.write('\n')
             examples = entry.get('examples')
             if examples is None:
                 continue
             out.write("~~~~\n")
             for index, example in enumerate(examples):
                 if index > 0:
                     # blank line between consecutive examples
                     out.write('\n')
                 out.write("jq '{}'\n".format(example.get('program', '')))
                 out.write("   {}\n".format(example.get('input', '')))
                 results = ', '.join(
                     str(item) for item in example.get('output', []))
                 out.write("=> {}\n".format(results))
             out.write("~~~~\n")
         out.write('\n')
     out.write(manual.get('manpage_epilogue', ''))
     return out.getvalue()


 # Step 1: render manual.yml into the markdown dialect this script expects.
 markdown_data = convert_manual_to_markdown()

 # Step 2: markdown -> HTML, with raw-HTML handling switched off and fenced
 # code blocks enabled.
 markdown_extensions = [EscapeHtml(), 'fenced_code']
 html_data = markdown.markdown(markdown_data, extensions=markdown_extensions)

 # Step 3: parse the HTML into an element tree that can be walked.
 html_parser = etree.HTMLParser()
 tr = etree.HTML(html_data, html_parser)

 # Step 4: walk the tree and write the roff output (to stdout by default).
 roff_walker = RoffWalker(tr)
 roff_walker.walk()
	#!/usr/bin/env python3
	from datetime import date
	from io import StringIO
	from lxml import etree
	import markdown
	from markdown.extensions import Extension
	import re
	import sys
	import yaml


	# Prevent our markdown parser from trying to help by interpreting things in angle brackets as HTML tags.
	class EscapeHtml(Extension):
	    """Markdown extension that disables raw-HTML handling.

	    With the HTML block preprocessor and the inline HTML pattern removed,
	    text in angle brackets is not interpreted as an HTML tag.
	    """

	    def extendMarkdown(self, md):
	        """Deregister the HTML-related processors from ``md``."""
	        md.preprocessors.deregister('html_block')
	        md.inlinePatterns.deregister('html')


	class RoffWalker:
	    """Walk a parsed HTML element tree and emit roff (man page) markup.

	    Nodes must offer the lxml element interface used below: ``tag``,
	    ``text``, ``tail``, ``getnext()``, ``itertext()``, ``len()`` and
	    indexing. Output is collected in an in-memory buffer and only written
	    to ``output`` when walk() finishes.
	    """

	    def __init__(self, tree, output=sys.stdout):
	        """Store the tree to convert and the stream that receives the roff."""
	        self.tree = tree
	        self.target = output
	        self.f = StringIO()  # roff text accumulated while walking

	    def walk(self):
	        """Convert the whole tree and write the result to the target."""
	        self._walk(self.tree, parent_tag=None)
	        # We don't want to start lines with \. because that can confuse man.
	        # Such lines are prefixed with \& so man knows not to treat them as
	        # a directive.
	        data = re.sub(r'^\\\.', r'\&.', self.f.getvalue(), flags=re.MULTILINE)
	        self.target.write(data)

	    def _ul_is_special(self, root):
	        """Return True if root has a single <li> child whose text ends in ':'."""
	        if len(root) != 1 or root[0].tag != 'li':
	            return False
	        return ''.join(root[0].itertext()).strip().endswith(':')

	    def _walk_child(self, root):
	        """Walk root's children, if it has any."""
	        if len(root) > 0:
	            self._walk(root[0], parent_tag=root.tag)

	    def _write_element(self, root, ensure_newline=True):
	        """Write root's own text, then its children, then its tail."""
	        if root.text is not None:
	            self.__write_raw(self._sanitize(root.text))
	        self._walk_child(root)
	        self._write_tail(root, ensure_newline=ensure_newline)

	    def _write_tail(self, root, ensure_newline=False, inline=False):
	        """Write the text that follows root's closing tag.

	        A tail consisting of a single newline is skipped unless ``inline``
	        is set. When ``ensure_newline`` is set, a newline is written
	        afterwards.
	        """
	        tail = root.tail
	        if tail is not None and (inline or tail != '\n'):
	            # _sanitize() replaces every newline with a space, so the text
	            # written here never terminates the line on its own.
	            self.__write_raw(self._sanitize(tail))
	        if ensure_newline:
	            self.__write_raw('\n')

	    def _write_inline(self, root, font, sanitize):
	        """Write root's text wrapped in a roff font change, then its tail.

	        ``font`` is the roff font letter (for example 'B' or 'I') and
	        ``sanitize`` is the escaping function applied to the text. Child
	        elements of root are not visited.
	        """
	        if root.text is not None:
	            self.__write_raw('\\f{}{}\\fR'.format(font, sanitize(root.text)))
	        self._write_tail(root, inline=True)

	    def _walk(self, root, parent_tag=None):
	        """Emit roff for ``root`` and each of its following siblings.

	        ``parent_tag`` is the tag of the enclosing element; paragraphs
	        inside an <li> are written without a paragraph break.
	        """
	        last_tag = None
	        while root is not None:
	            tag = root.tag
	            if tag == 'h1':
	                self.__write_cmd('.TH "JQ" "1" "{}" "" ""'.format(
	                    date.today().strftime('%B %Y')))
	                self.__write_cmd('.SH "NAME"')
	                # TODO: properly parse this
	                self.__write_raw(r'\fBjq\fR \- Command\-line JSON processor' +
	                                 "\n")

	            elif tag == 'h2':
	                title = ''.join(root.itertext()).strip()
	                self.__write_cmd('.SH "{}"'.format(title))

	            elif tag == 'h3':
	                title = ''.join(root.itertext()).strip()
	                self.__write_cmd('.SS "{}"'.format(self._h3_sanitize(title)))

	            elif tag == 'p':
	                if last_tag not in ('h2', 'h3') and parent_tag != 'li':
	                    self.__write_cmd('.P')
	                self._write_element(root, ensure_newline=(parent_tag != 'li'))

	            elif tag == 'ul':
	                if self._ul_is_special(root):
	                    self.__write_cmd('.TP')
	                    self._write_element(root[0])
	                    # Consume the paragraphs that follow the list as indented
	                    # paragraphs.
	                    sibling = root.getnext()
	                    while sibling is not None and sibling.tag == 'p':
	                        after = sibling.getnext()
	                        if after is not None and after.tag == 'pre':
	                            # we don't want to .IP these, because it'll look
	                            # funny with the code indent
	                            break
	                        self.__write_cmd('.IP')
	                        self._write_element(sibling)
	                        root = sibling
	                        sibling = after
	                else:
	                    self._walk_child(root)
	                    self._write_tail(root)
	                    # A pre tag after the end of a list doesn't want two of
	                    # the indentation commands
	                    following = root.getnext()
	                    if following is None or following.tag != 'pre':
	                        self.__write_cmd('.IP "" 0')

	            elif tag == 'li':
	                self.__write_cmd(r'.IP "\(bu" 4')
	                if root.text is not None and root.text.strip() != '':
	                    self.__write_raw(self._sanitize(root.text))
	                self._walk_child(root)
	                self._write_tail(root, ensure_newline=True)

	            elif tag == 'strong':
	                self._write_inline(root, 'B', self._sanitize)

	            elif tag == 'em':
	                self._write_inline(root, 'I', self._sanitize)

	            elif tag == 'code':
	                self._write_inline(root, 'B', self._code_sanitize)

	            elif tag == 'pre':
	                self.__write_cmd('.IP "" 4')
	                self.__write_cmd('.nf\n')  # extra newline for spacing reasons
	                # Merge a run of adjacent <pre> elements into one no-fill
	                # block, separated by blank lines.
	                block = root
	                first = True
	                while block is not None and block.tag == 'pre':
	                    if not first:
	                        self.__write_raw('\n')
	                    text = ''.join(block.itertext(with_tail=False))
	                    self.__write_raw(self._pre_sanitize(text))
	                    first = False
	                    root = block
	                    block = block.getnext()
	                self.__write_cmd('.fi')
	                self.__write_cmd('.IP "" 0')

	            else:
	                self._walk_child(root)

	            # root may have advanced past consumed siblings above, so read
	            # the tag from root again rather than reusing ``tag``.
	            last_tag = root.tag
	            root = root.getnext()

	    def _base_sanitize(self, text):
	        """Escape backslash, '.', apostrophe and '-' for roff.

	        Backslashes are handled first so that the backslashes added by the
	        later replacements are not escaped again.
	        """
	        return (text.replace('\\', '\\e')
	                .replace('.', '\\.')
	                .replace("'", "\\'")
	                .replace('-', '\\-'))

	    def _pre_sanitize(self, text):
	        """Escape preformatted text; whitespace is kept as is."""
	        return self._base_sanitize(text)

	    def _code_sanitize(self, text):
	        """Escape inline code and turn each whitespace character into a space."""
	        return re.sub(r'\s', ' ', self._base_sanitize(text))

	    def _h3_sanitize(self, text):
	        """Escape a heading and join its lines with single spaces."""
	        text = self._base_sanitize(text)
	        # The '|' must be an unescaped alternation: a newline next to a
	        # space collapses into one space together with that space. An
	        # escaped pipe would only match a literal '|' between two newlines.
	        text = re.sub(' \n|\n ', ' ', text)
	        return text.replace('\n', ' ')

	    def _sanitize(self, text):
	        """Escape running text for roff.

	        Text in angle brackets is set in italics, runs of spaces are
	        collapsed to one space and newlines become spaces.
	        """
	        text = self._base_sanitize(text)
	        text = re.sub(r'<([^>]+)>', r'\\fI\1\\fR', text)
	        text = re.sub(r' +', ' ', text)
	        return text.replace('\n', ' ')

	    def __write_cmd(self, dat):
	        """Append a roff request on its own line, preceded by a lone '.' line."""
	        print('.', dat, sep='\n', file=self.f)

	    def __write_raw(self, dat):
	        """Append ``dat`` to the buffer without a trailing newline."""
	        print(dat, sep='', end='', file=self.f)


	def load_yml_file(fn):
	    """Parse the YAML file at path ``fn`` and return the loaded data.

	    The file is decoded as UTF-8 explicitly so the result does not depend
	    on the locale's default encoding.
	    """
	    with open(fn, encoding='utf-8') as stream:
	        return yaml.safe_load(stream)


	def dedent_body(body):
	    """Remove a two-space indent from the lines of ``body``.

	    Only lines that begin with exactly two spaces followed by a
	    non-whitespace character are changed; all other lines, including more
	    deeply indented ones, are returned untouched.
	    """
	    # NOTE(review): the two-space pattern follows the intact copy of this
	    # function earlier in the file; the one-space pattern found here looked
	    # like whitespace-collapse damage - confirm.
	    lines = [re.sub(r'^  (\S)', r'\1', line) for line in body.split('\n')]
	    return '\n'.join(lines)


	def convert_manual_to_markdown():
	    """Render content/manual/manual.yml as one markdown document.

	    The output is the manual's 'manpage_intro' and 'body', then every
	    section as a level-2 heading (title upper-cased) with its body, every
	    entry of the section as a level-3 heading with its body and, when the
	    entry has examples, a fenced block listing each example's program,
	    input and outputs. The 'manpage_epilogue' is appended last.
	    """
	    f = StringIO()
	    manual = load_yml_file("content/manual/manual.yml")
	    f.write(manual.get('manpage_intro', '\n'))
	    f.write(dedent_body(manual.get('body', '\n')))
	    for section in manual.get('sections', []):
	        f.write('## {}\n'.format(section.get('title', '').upper()))
	        f.write(dedent_body(section.get('body', '\n')))
	        f.write('\n')
	        for entry in section.get('entries', []):
	            f.write('### {}\n'.format(entry.get('title', '')))
	            f.write(dedent_body(entry.get('body', '\n')))
	            f.write('\n')
	            examples = entry.get('examples')
	            if examples is not None:
	                f.write("~~~~\n")
	                for index, example in enumerate(examples):
	                    if index > 0:
	                        # blank line between consecutive examples
	                        f.write('\n')
	                    f.write("jq '{}'\n".format(example.get('program', '')))
	                    # NOTE(review): three-space indent follows the intact
	                    # copy of this function earlier in the file; the single
	                    # space found here looked like whitespace-collapse
	                    # damage - confirm.
	                    f.write("   {}\n".format(example.get('input', '')))
	                    output = [str(x) for x in example.get('output', [])]
	                    f.write("=> {}\n".format(', '.join(output)))
	                f.write("~~~~\n")
	        f.write('\n')
	    f.write(manual.get('manpage_epilogue', ''))
	    return f.getvalue()


	# Step 1: render manual.yml into the markdown dialect this script expects.
	markdown_data = convert_manual_to_markdown()

	# Step 2: markdown -> HTML, with raw-HTML handling switched off and fenced
	# code blocks enabled.
	markdown_extensions = [EscapeHtml(), 'fenced_code']
	html_data = markdown.markdown(markdown_data, extensions=markdown_extensions)

	# Step 3: parse the HTML into an element tree that can be walked.
	html_parser = etree.HTMLParser()
	tr = etree.HTML(html_data, html_parser)

	# Step 4: walk the tree and write the roff output (to stdout by default).
	roff_walker = RoffWalker(tr)
	roff_walker.walk()