#!/usr/local/bin/python
"""
  NAME

    dlc.py - Dead Link Checker

  SYNOPSIS

    python dlc.py [-v] [-i] [-a error_list] {url|file}

      -v   verbose
      -i   check internal links only
      -a   http error codes to accept as OK, defined in comma separated list
      
  DESCRIPTION

    dlc.py walks the HTML pages of a web-site (as given by url),
    reporting which links are broken.  External links are checked for
    validity, but not followed further.

    If a file name is provided, only external links are checked, since
    the addresses on internal links cannot be deduced.

    A failure is assumed to be a returned http error code of 400 or
    greater, with the exception of 403, since some sites forbid access
    to robots such as this link checker.  You can suppress this
    exception by passing an empty list via the -a argument,
    i.e. -a ''.

  MODIFICATION HISTORY
  Mnemonic  Date   Who
  dlc       050901 mpw
    Written.
  dlc       060102 mpw
    Added -i option
  dlc       071108 mpw
    http error 403 counts as OK (assume forbidden because of robot access)
    Added -a option
  dlc       071111 mpw
    Improved command line argument handling.  
  dlc       071112 mpw
    Output http response codes when verbose
    
$Id: dlc.py,v 1.1 2014/04/19 13:57:30 mark Exp $    
"""

import HTMLParser
import httplib
import urlparse
import sys
import getopt

class LinkParser(HTMLParser.HTMLParser):
    """HTML parser that hands the href of every <a> tag to check_page().

    Set pageaddr to the address of the page being fed before calling
    feed(); it is passed to check_page() as the page the link was
    found on.
    """
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.pageaddr = ""              # address of page being processed
    def handle_starttag(self,tag,attrs):
        """Check the target of each href attribute of an anchor tag."""
        if tag != "a": return
        for name,value in attrs:
            # HTMLParser reports an attribute without a value (<a href>)
            # as None; there is nothing to check in that case.
            if name == "href" and value is not None:
                # surrounding whitespace is not part of the address
                check_page(self.pageaddr,value.strip())

def check_page(curaddr,link):
    global root_url
    global vlist
    global flist
    global verbose
    global check_external
    global not_error

    # only visit the first occurance of a link
    if link in vlist: return
    vlist.append(link)

    follow = True
    if link.startswith("http:") or link.startswith("https:"):
        follow = link==root_url
    elif link.startswith("ftp:"):
        # TBD handle FTP links
        print >>sys.stderr,"\nWarning: %s not checked." %(link,)
        return
    elif link.startswith("mailto:"):
        # print "mailto - ",link
        return
    else:
        # local link; if curaddr is empty, must be checking a file
        # therefore no local links are checked.
        if curaddr == "":
            vlist.pop()
            return
        link = urlparse.urljoin(curaddr,link)
        follow = True
        
    url_list = urlparse.urlsplit(link)
    if check_external or follow:
        resp = None
        error_code = 0
        try:
            if verbose:
                print "Checking",link,
            else:
                sys.stdout.write(".")
            sys.stdout.flush()

            h = httplib.HTTPConnection(url_list[1])
            if url_list[3] == '':
                page_addr = url_list[2]
            else:
                page_addr = url_list[2]+"?"+url_list[3]
            h.request("GET",page_addr,None,\
                      {'From':'www@hydrus.org.uk',\
                       'User-Agent':'Hydrus Dead Link Checker 1.0'})
            resp = h.getresponse()
            failed = resp.status >= 400 and not (resp.status in error_ok)
        except:
            failed = True

        if resp: error_code = resp.status
        if failed:
            if verbose: print " *FAILED* [%d]" % (error_code,)
            flist.append("%s: %s [%d]" % (curaddr,link,error_code))
            return

        if verbose: print " (OK) [%d]" % (error_code,)
        if follow and resp.msg.getsubtype() == "html":
                p = LinkParser()
                p.pageaddr = link
                p.feed(resp.read())
                p.close()
    else:
        if verbose: print "Skipping external link",link

#####################################################################
#####################################################################

verbose = False
check_external = True
vlist = list()
flist = list()
error_ok = (403,)

try:
    opts,args = getopt.getopt(sys.argv[1:],'a:vi')
    for o,v in opts:
        if o == '-v':
            verbose = True
        elif o == "-i":
            check_external = False
        elif o == '-a':
            error_ok = v.split(',')
            error_ok = [int(x) for x in error_ok]
except getopt.GetoptError,e:
    print "%s: illegal argument: %s" % (sys.argv[0],e.opt)
    sys.exit(1)

if len(args) == 0:
    print "%s: missing url or filename" % (sys.argv[0],)
    sys.exit(1)

root_url = sys.argv.pop()
url = root_url.startswith("http")

print "Dead Link Checker running ",
if verbose: print

if url:
    check_page(root_url,"")
else:
    p = LinkParser()
    p.feed(open(root_url).read())
    p.close()

print "\n\n%d links checked; %d failed." % (len(vlist),len(flist))

if len(flist) != 0:
    print "\nThe following links failed:"
    for v in flist:
        print "%s" % (v,)
    
