import formatter, htmllib, sys, urllib

class LinksExtractor(htmllib.HTMLParser):              # derive a new HTML parser

    def __init__(self, formatter):                     # class constructor
        htmllib.HTMLParser.__init__(self, formatter)   # base class constructor
        self.links = []                                 # empty list for storing hyperlinks

    def start_a(self, attrs):                           # override the handler for <a> tags
        # process the attributes
        if len(attrs) > 0:
            for attr in attrs:
                if attr[0] == "href":                   # ignore all non-HREF attributes
                    self.links.append(attr[1])          # save the link info in the list

    def get_links(self):                                # return the list of extracted links
        return self.links

format = formatter.NullFormatter()                      # create default formatter
htmlparser = LinksExtractor(format)                     # create new parser object

if len(sys.argv) < 2:
    print "ERROR: Provide linklist filename"
    sys.exit(1)
if len(sys.argv) == 2:                                  # pad argv so sys.argv[2] always exists
    sys.argv = sys.argv + ['']

allLinks = []
linklist = open(sys.argv[1], 'r')

while True:
    link = linklist.readline()
    if not link:
        break
    print "now processing: " + link.rstrip()
    data = urllib.urlopen(link.rstrip())
    #if sys.argv[2] == 'see':
    #    while True:
    #        sent = data.readline()
    #        if 'a name="See_also"' in sent:
    #            htmlparser.feed(data.read())           # parse the file, saving the link info
    #            break
    #        if '' in sent:
    #            print "empty link: " + link.rstrip()
    #            break
    #else:
    htmlparser.feed(data.read())                        # parse the file, saving the link info

linklist.close()

allLinks = htmlparser.get_links()                       # get the accumulated hyperlink list
print len(allLinks)

cleanLink = {}
for link in allLinks:
    if ':' not in link and '#' not in link:             # keep only relative, fragment-free links
        if link in cleanLink:
            cleanLink[link] = cleanLink[link] + 1
        else:
            cleanLink[link] = 1

print len(cleanLink)

sortedDict = sorted(cleanLink.iteritems(), key=lambda (k, v): (v, k), reverse=True)
for link in sortedDict:
    print link
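
The script above targets Python 2: htmllib, formatter, urllib.urlopen, dict.iteritems, and tuple-unpacking lambdas are all gone in Python 3. As a rough reference only, here is a minimal sketch of the same link-extraction and counting logic on Python 3, using the standard html.parser and urllib.request modules; the class name LinksExtractor3 is illustrative and not part of the original script, and ties between equal counts may sort differently than the original (v, k) sort.

from collections import Counter
from html.parser import HTMLParser
import sys
import urllib.request

class LinksExtractor3(HTMLParser):
    """Collect href values from <a> tags (Python 3 counterpart of LinksExtractor)."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value is not None:
                    self.links.append(value)

if len(sys.argv) < 2:
    print("ERROR: Provide linklist filename")
    sys.exit(1)

parser = LinksExtractor3()
with open(sys.argv[1]) as linklist:
    for link in linklist:
        link = link.rstrip()
        if not link:
            continue
        print("now processing: " + link)
        with urllib.request.urlopen(link) as resp:
            parser.feed(resp.read().decode("utf-8", errors="replace"))

# Keep only relative, fragment-free links and count duplicates.
clean = Counter(l for l in parser.links if ':' not in l and '#' not in l)
print(len(parser.links))
print(len(clean))
for link, count in clean.most_common():
    print(link, count)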