#!/usr/local/bin/python
"""
  NAME

    dlc.py - Dead Link Checker

  SYNOPSIS

    python dlc.py {-u url|-f file} [-v]

      -u   check links on url provided.
      -f   check links in file name provided.  Only external links will
           be checked.
      -v   verbose

  DESCRIPTION

    dlc.py walks the HTML pages of a web-site (as given by url),
    reporting which links are broken.  External links are checked for
    validity, but not followed further.

    If a file name is provided, via the -f option, only external links
    are checked, since the addresses on internal links cannot be
    deduced.

  MODIFICATION HISTORY
  Mnemonic  Rel   Date   Who
  dlc       1.0   050901 mpw
    Written.

$Id: dlc.py,v 1.1.1.1 2005/09/22 19:14:47 mark Exp $    
"""

import HTMLParser
import httplib
import urlparse
import sys
import getopt

class LinkParser(HTMLParser.HTMLParser):
    """HTML parser that passes the href of every <a> tag to check_page().

    Set ``pageaddr`` to the address of the page being fed before calling
    feed(); it is handed to check_page() as the page the link was found
    on.  It is left as "" when the HTML comes from a local file.
    """
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.pageaddr = ""              # address of page being processed
    def handle_starttag(self,tag,attrs):
        """Check the target of each anchor tag; ignore every other tag.

        tag   -- tag name as reported by HTMLParser
        attrs -- list of (name, value) attribute pairs
        """
        if tag != "a": return
        for name,value in attrs:
            # An attribute written without a value (<a href>) is reported
            # with value None; there is no address to check in that case,
            # and check_page() expects a string.
            if name == "href" and value is not None:
                check_page(self.pageaddr,value)

def smatch(s,ls):
    """Return True if s is equal to an element of the list ls, else False.

    s  -- value to look for
    ls -- list of values to search (an empty list never matches)
    """
    # The built-in membership test does the same equality scan as a
    # hand-written loop and stops at the first match.
    return s in ls

def check_page(curaddr,link):
    global root_url
    global vlist
    global flist
    global verbose
    
    if smatch(link,vlist): return
    vlist.append(link)
    if link.find("http:") >= 0:
        follow = link==root_url
    elif link.find("ftp:") >= 0:
        # TBD handle FTP links
        print >>sys.stderr,"\nWarning: %s not checked." %(link,)
        return
    elif link.find("mailto:") >= 0:
        # print "mailto - ",link
        return
    else:
        # local link; if curaddr is empty, must be checking a file
        # therefore no local links are checked.
        if curaddr == "":
            vlist.pop()
            return
        link = urlparse.urljoin(curaddr,link)
        follow = True

    url_list = urlparse.urlsplit(link)
    try:
        if verbose:
            print "Checking",link,
        else:
            sys.stdout.write(".")
        sys.stdout.flush()
            
        h = httplib.HTTPConnection(url_list[1])
        h.request("GET",url_list[2])
        resp = h.getresponse()
        failed = resp.status >= 400
    except:
        failed = True

    if failed:
        if verbose: print " *FAILED*"
        flist.append("%s: %s" % (curaddr,link))
        return

    if verbose: print " (OK)"
    if follow and resp.msg.getsubtype() == "html":
            p = LinkParser()
            p.pageaddr = link
            p.feed(resp.read())
            p.close()

if len(sys.argv) == 1:
    print >>sys.stderr, "%s: need url as argument.\n" % (sys.argv[0],)
    sys.exit(0)

url = True
verbose = False
vlist = list()
flist = list()

try:
    opts,args = getopt.getopt(sys.argv[1:],'u:f:v')
    for o,v in opts:
        if o == '-u':
            root_url = v
        elif o == '-f':
            root_url = v
            url = False
        elif o == '-v':
            verbose = True
except getopt.GetoptError,e:
    print "%s: illegal argument: %s" % (sys.argv[0],e.opt)
    sys.exit(1)

print "Dead Link Checker running ",
if verbose: print

if url:
    check_page(root_url,"")
else:
    p = LinkParser()
    #p.pageaddr = root_url
    p.feed(open(root_url).read())
    p.close()

print "\n\n%d links checked; %d failed." % (len(vlist),len(flist))

if len(flist) != 0:
    print "\nThe following links failed:"
    for v in flist:
        print "%s" % (v,)
    
