blob: 582b8521da154423a3eacb52e434b171f9320977 [file] [log] [blame]
#!/usr/bin/env python
""" I honestly don't even know how the hell this works, just use it. """
__author__ = "Scott Stamp <scott@hypermine.com>"
from HTMLParser import HTMLParser
from urlparse import urljoin
from sys import setrecursionlimit
import re
import requests
setrecursionlimit(10000)
root = 'http://localhost:8000'
class DataHolder:
def __init__(self, value=None, attr_name='value'):
self._attr_name = attr_name
self.set(value)
def __call__(self, value):
return self.set(value)
def set(self, value):
setattr(self, self._attr_name, value)
return value
def get(self):
return getattr(self, self._attr_name)
class Parser(HTMLParser):
global root
ids = set()
crawled = set()
anchors = {}
pages = set()
save_match = DataHolder(attr_name='match')
def __init__(self, origin):
self.origin = origin
HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
attrs = dict(attrs)
if 'href' in attrs:
href = attrs['href']
if re.match('^{0}|\/|\#[\S]{{1,}}'.format(root), href):
if self.save_match(re.search('.*\#(.*?)$', href)):
if self.origin not in self.anchors:
self.anchors[self.origin] = set()
self.anchors[self.origin].add(
self.save_match.match.groups(1)[0])
url = urljoin(root, href)
if url not in self.crawled and not re.match('^\#', href):
self.crawled.add(url)
Parser(url).feed(requests.get(url).content)
if 'id' in attrs:
self.ids.add(attrs['id'])
# explicit <a name=""></a> references
if 'name' in attrs:
self.ids.add(attrs['name'])
r = requests.get(root)
parser = Parser(root)
parser.feed(r.content)
for anchor in sorted(parser.anchors):
if not re.match('.*/\#.*', anchor):
for anchor_name in parser.anchors[anchor]:
if anchor_name not in parser.ids:
print 'Missing - ({0}): #{1}'.format(
anchor.replace(root, ''), anchor_name)