| #! /usr/bin/env python | |
| """A variant on webchecker that creates a mirror copy of a remote site.""" | |
| __version__ = "$Revision$" | |
| import os | |
| import sys | |
| import urllib | |
| import getopt | |
| import webchecker | |
# Reduce an expanded RCS keyword ("$Revision: N $") to the bare number N.
# An unexpanded "$Revision$" splits into a single word and is left as is.
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
def main():
    """Parse the command line and mirror every root URL given on it.

    Options:
        -q  set the verbosity to 0
        -v  raise the verbosity by one (may be repeated)

    The starting verbosity is webchecker.VERBOSE.  Returns 2 when the
    command line cannot be parsed and None otherwise.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        # Single-argument print(...) produces the same output on
        # Python 2 and 3, unlike the multi-argument print statement.
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % sys.argv[0])
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        elif o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    # Identify this tool (and its revision) to the servers it visits.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % arg)
        c.addroot(arg)
    print("Run...")
    c.run()
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it reads.

    Pages are stored under ``<host>/<path>`` relative to the current
    directory (see savefilename).  A page that is already on disk is read
    from there instead of being fetched again.
    """

    # Settings consumed by webchecker.Checker.
    # NOTE(review): presumably checkext=0 turns off external-link checking
    # and nonames=1 turns off name-anchor checking -- confirm in webchecker.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return ``(text, url)`` for the page named by *url_pair*.

        Only ``url_pair[0]`` is used to locate the page.  If the page is
        already saved locally it is read from disk; otherwise it is
        fetched with ``self.openpage``, saved with ``self.savefile`` and
        then returned.  *text* is None when the page could not be opened
        or when ``self.checkforhtml`` rejects it.  The returned *url* is
        the one reported by the response's ``geturl()`` when that differs
        from the requested one.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No usable local copy: fetch the page and store it.
            f = self.openpage(url_pair)
            if f:
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        # The opener ended up at a different URL; file
                        # the page under that name instead.
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    # Release the connection even if reading fails.
                    f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Local copy: there are no response headers, so checkforhtml
            # gets an empty mapping and only the URL to judge by.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write *text* to *path*, creating parent directories as needed.

        Saving is best-effort: any IOError or OSError (from creating the
        directories, opening, writing or closing) is reported through
        ``self.message`` instead of being raised.
        """
        try:
            makedirs(os.path.dirname(path))
            f = open(path, "wb")
            try:
                f.write(text)
            finally:
                # Do not leak the handle when the write fails.
                f.close()
        except (IOError, OSError) as msg:
            self.message("didn't save %s: %s", path, str(msg))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Map *url* to the relative local path its copy is stored at.

        The result is ``<lower-cased host>/<url path>``, using the local
        path separator; user info and port are dropped from the host.  An
        empty path, or one ending in "/", gets "index.html" appended.
        Empty, "." and ".." path segments are resolved here so that a
        crafted link cannot climb out of the host directory through ".."
        path segments.  A URL without a host part yields just the path.
        """
        scheme, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        # splithost() gives None for the host when there is no "//host"
        # part; the remaining helpers need a string.
        user, host = urllib.splituser(host or "")
        host, port = urllib.splitnport(host)
        host = host.lower()
        wants_index = not path or path[-1] == "/"
        parts = []
        for segment in path.split("/"):
            if segment == "..":
                # Step up, but never above the host directory.
                if parts:
                    parts.pop()
            elif segment and segment != ".":
                parts.append(segment)
        if wants_index or not parts:
            parts.append("index.html")
        return os.path.join(host, os.sep.join(parts))
def makedirs(dir):
    """Create directory *dir*, together with any missing parents.

    Nothing is done for an empty path or for an existing directory.  If
    *dir* exists but is not a directory (a page saved earlier under a
    name that now has to hold sub-pages), it is moved to
    ``dir/index.html`` on a best-effort basis; errors during that
    shuffle are ignored.

    A trailing path separator ("a/b/") is accepted.  Errors raised while
    creating a missing directory propagate to the caller.

    The parameter keeps its historical name, although it shadows the
    ``dir`` builtin, so that keyword callers keep working.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            backup = dir + ".bak"
            try:
                os.rename(dir, backup)
                os.mkdir(dir)
                os.rename(backup, os.path.join(dir, "index.html"))
            except os.error:
                # Best effort only; the caller finds out when it tries
                # to write below dir.
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head and head != dir:
            # "a/b/" splits into ("a/b", ""): build the bare head instead.
            makedirs(head)
            return
        print("Huh? Don't know how to make dir %s" % dir)
        return
    makedirs(head)
    # 0o777 is the octal spelling accepted by Python 2.6+ and Python 3.
    os.mkdir(dir, 0o777)
# Script entry point: main() returns None on success, which is mapped to
# exit status 0; any other return value becomes the exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status or 0)