docs/docvalidate.py - third_party/github.com/moby/moby - Git at Google

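 #!/usr/bin/env python

 """Validate in-page anchors of the documentation site served at ``root``.

 Crawls the same-site links reachable from ``root`` and prints a
 ``Missing - (<page>): #<fragment>`` line for each linked fragment that was
 never seen as an ``id`` or ``name`` attribute on any crawled page.
 """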
 __author__ = "Scott Stamp <scott@hypermine.com>"

 from HTMLParser import HTMLParser
 from urlparse import urljoin
 from sys import setrecursionlimit
 import re
 import requests

 # Parser.handle_starttag creates and feeds a new Parser for every link it
 # follows, from inside the running feed() call, so the crawl nests deeper
 # with each followed page; presumably the limit is raised to leave room
 # for that.
 setrecursionlimit(10000)
 # Base URL of the site to check: the crawl starts here and every accepted
 # href is resolved against it with urljoin().
 root = 'http://localhost:8000'


 class DataHolder:
     """Mutable single-value cell whose storage attribute name is configurable.

     Calling the instance stores a value and returns that same value, which
     lets an assignment be embedded in a condition:
     ``if holder(compute()): use(holder.get())``.
     """

     def __init__(self, value=None, attr_name='value'):
         """Remember which attribute holds the value, then store ``value``."""
         self._attr_name = attr_name
         self.set(value)

     def set(self, value):
         """Store ``value`` under the configured attribute and return it."""
         name = self._attr_name
         setattr(self, name, value)
         return value

     def get(self):
         """Return the value currently stored under the configured attribute."""
         name = self._attr_name
         return getattr(self, name)

     def __call__(self, value):
         """Shorthand for ``set(value)``; returns ``value``."""
         stored = self.set(value)
         return stored


 class Parser(HTMLParser):
     """HTML parser that crawls same-site links and collects anchor data.

     Each instance parses one page, identified by ``origin``.  For every
     start tag whose ``href`` starts with ``root``, starts with ``/`` or is
     a ``#fragment``, the fragment (if any) is recorded under ``origin``;
     targets that are not fragment-only are fetched and parsed recursively
     by a fresh ``Parser``.  The ``id`` and ``name`` attribute values of
     every tag are collected as known anchor targets.

     The collections below are class attributes, so all instances created
     during the recursive crawl accumulate into the same objects.
     """

     ids = set()       # id= / name= values seen on any parsed page
     crawled = set()   # absolute URLs that have already been fetched
     anchors = {}      # origin URL -> set of fragment names linked from it
     pages = set()     # not referenced anywhere in this class
     # Not used by handle_starttag; kept so that code referring to
     # Parser.save_match keeps working.
     save_match = DataHolder(attr_name='match')

     def __init__(self, origin):
         """Create a parser for the page whose URL is ``origin``."""
         self.origin = origin
         HTMLParser.__init__(self)

     def handle_starttag(self, tag, attrs):
         """Record anchors/ids for one start tag and crawl its link target.

         Args:
             tag: Tag name (unused).
             attrs: List of ``(name, value)`` pairs supplied by HTMLParser;
                 a value is ``None`` when the attribute has no value.
         """
         attrs = dict(attrs)
         href = attrs.get('href')

         # A valueless attribute (``<a href>``) gives None, which re.match
         # would reject with TypeError; an empty href never matched the
         # pattern anyway, so both are skipped here.
         link_pattern = r'^{0}|/|#\S+'.format(re.escape(root))
         if href and re.match(link_pattern, href):
             # Everything after the last '#' is the fragment name.
             fragment = re.search(r'.*#(.*?)$', href)
             if fragment:
                 self.anchors.setdefault(self.origin, set()).add(
                     fragment.group(1))

             url = urljoin(root, href)

             # Fragment-only links are recorded above but never fetched.
             if url not in self.crawled and not href.startswith('#'):
                 self.crawled.add(url)
                 Parser(url).feed(requests.get(url).content)

         # ``name`` covers explicit <a name=""></a> references.
         for key in ('id', 'name'):
             if key in attrs:
                 self.ids.add(attrs[key])


 # Crawl the site starting at ``root``; the class-level collections on
 # Parser are filled in as a side effect of feed().
 r = requests.get(root)
 parser = Parser(root)
 parser.feed(r.content)

 # Report every fragment that was linked from a page but never seen as an
 # id= / name= value anywhere in the crawl.  Origins whose URL contains
 # "/#" are skipped.
 for anchor in sorted(parser.anchors):
     if re.match(r'.*/#.*', anchor):
         continue
     for anchor_name in parser.anchors[anchor]:
         if anchor_name not in parser.ids:
             # A single parenthesised argument prints identically as a
             # Python 2 statement and as a Python 3 function call.
             print('Missing - ({0}): #{1}'.format(
                 anchor.replace(root, ''), anchor_name))
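	#!/usr/bin/env python

	"""Validate in-page anchors of the documentation site served at ``root``.

	Crawls the same-site links reachable from ``root`` and prints a
	``Missing - (<page>): #<fragment>`` line for each linked fragment that was
	never seen as an ``id`` or ``name`` attribute on any crawled page.
	"""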
	__author__ = "Scott Stamp <scott@hypermine.com>"

	from HTMLParser import HTMLParser
	from urlparse import urljoin
	from sys import setrecursionlimit
	import re
	import requests

	# Parser.handle_starttag creates and feeds a new Parser for every link it
	# follows, from inside the running feed() call, so the crawl nests deeper
	# with each followed page; presumably the limit is raised to leave room
	# for that.
	setrecursionlimit(10000)
	# Base URL of the site to check: the crawl starts here and every accepted
	# href is resolved against it with urljoin().
	root = 'http://localhost:8000'


	class DataHolder:
	    """Mutable single-value cell whose storage attribute name is configurable.

	    Calling the instance stores a value and returns that same value, which
	    lets an assignment be embedded in a condition:
	    ``if holder(compute()): use(holder.get())``.
	    """

	    def __init__(self, value=None, attr_name='value'):
	        """Remember which attribute holds the value, then store ``value``."""
	        self._attr_name = attr_name
	        self.set(value)

	    def __call__(self, value):
	        """Shorthand for ``set(value)``; returns ``value``."""
	        return self.set(value)

	    def set(self, value):
	        """Store ``value`` under the configured attribute and return it."""
	        setattr(self, self._attr_name, value)
	        return value

	    def get(self):
	        """Return the value currently stored under the configured attribute."""
	        return getattr(self, self._attr_name)


	class Parser(HTMLParser):
	    """HTML parser that crawls same-site links and collects anchor data.

	    Each instance parses one page, identified by ``origin``.  For every
	    start tag whose ``href`` starts with ``root``, starts with ``/`` or is
	    a ``#fragment``, the fragment (if any) is recorded under ``origin``;
	    targets that are not fragment-only are fetched and parsed recursively
	    by a fresh ``Parser``.  The ``id`` and ``name`` attribute values of
	    every tag are collected as known anchor targets.

	    The collections below are class attributes, so all instances created
	    during the recursive crawl accumulate into the same objects.
	    """

	    ids = set()       # id= / name= values seen on any parsed page
	    crawled = set()   # absolute URLs that have already been fetched
	    anchors = {}      # origin URL -> set of fragment names linked from it
	    pages = set()     # not referenced anywhere in this class
	    # Not used by handle_starttag; kept so that code referring to
	    # Parser.save_match keeps working.
	    save_match = DataHolder(attr_name='match')

	    def __init__(self, origin):
	        """Create a parser for the page whose URL is ``origin``."""
	        self.origin = origin
	        HTMLParser.__init__(self)

	    def handle_starttag(self, tag, attrs):
	        """Record anchors/ids for one start tag and crawl its link target.

	        Args:
	            tag: Tag name (unused).
	            attrs: List of ``(name, value)`` pairs supplied by HTMLParser;
	                a value is ``None`` when the attribute has no value.
	        """
	        attrs = dict(attrs)
	        href = attrs.get('href')

	        # The three alternatives must be separated by unescaped '|';
	        # with escaped pipes the pattern is one literal string that no
	        # href matches, so nothing would ever be recorded or crawled.
	        link_pattern = r'^{0}|/|#\S+'.format(re.escape(root))
	        # A valueless attribute (``<a href>``) gives None, which re.match
	        # would reject with TypeError, so it is skipped.
	        if href and re.match(link_pattern, href):
	            # Everything after the last '#' is the fragment name.
	            fragment = re.search(r'.*#(.*?)$', href)
	            if fragment:
	                self.anchors.setdefault(self.origin, set()).add(
	                    fragment.group(1))

	            url = urljoin(root, href)

	            # Fragment-only links are recorded above but never fetched.
	            if url not in self.crawled and not href.startswith('#'):
	                self.crawled.add(url)
	                Parser(url).feed(requests.get(url).content)

	        # ``name`` covers explicit <a name=""></a> references.
	        for key in ('id', 'name'):
	            if key in attrs:
	                self.ids.add(attrs[key])


	# Crawl the site starting at ``root``; the class-level collections on
	# Parser are filled in as a side effect of feed().
	r = requests.get(root)
	parser = Parser(root)
	parser.feed(r.content)

	# Report every fragment that was linked from a page but never seen as an
	# id= / name= value anywhere in the crawl.  Origins whose URL contains
	# "/#" are skipped.
	for anchor in sorted(parser.anchors):
	    if re.match(r'.*/#.*', anchor):
	        continue
	    for anchor_name in parser.anchors[anchor]:
	        if anchor_name not in parser.ids:
	            # A single parenthesised argument prints identically as a
	            # Python 2 statement and as a Python 3 function call.
	            print('Missing - ({0}): #{1}'.format(
	                anchor.replace(root, ''), anchor_name))