#!/usr/bin/python
# Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.

"""Download files in bibliography into a local cache.
"""

import os
import sys
import signal
import time
import gzip

import BibTeX
import config
import urllib2
import getopt
import socket
import errno
import httplib

FILE_TYPES = [ "txt", "html", "pdf", "ps", "ps.gz", "abstract" ]
BIN_FILE_TYPES = [ 'pdf', 'ps.gz' ]

class UIError(Exception):
    pass

def tryUnlink(fn):
    try:
        os.unlink(fn)
    except OSError:
        pass

def getCacheFname(key, ftype, section):
    """Return the filename used to cache a local copy of <key>.<ftype>
       under the cache directory for <section>."""
    return BibTeX.smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
                            section,
                            "%s.%s"%(key,ftype))

def downloadFile(key, ftype, section, url, timeout=None):
    """Download 'url' into the cache as <key>.<ftype> in <section>, and
       write a companion .url file recording when and from where it was
       fetched."""
    if timeout is None:
        timeout = config.DOWNLOAD_CONNECT_TIMEOUT
    fname = getCacheFname(key, ftype, section)
    parent = os.path.split(fname)[0]
    if not os.path.exists(parent):
        os.makedirs(parent)

    fnameTmp = fname+".tmp"
    fnameURL = fname+".url"
    tryUnlink(fnameTmp)

    # Enforce a connect timeout with SIGALRM: the handler does nothing, but
    # the alarm interrupts a stalled urlopen() so it fails with EINTR.
    def sigalrmHandler(sig, _):
        pass
    signal.signal(signal.SIGALRM, sigalrmHandler)
    signal.alarm(timeout)
    try:
        try:
            infile = urllib2.urlopen(url)
        except httplib.InvalidURL, e:
            raise UIError("Invalid URL %s: %s"%(url,e))
        except IOError, e:
            raise UIError("Cannot connect to url %s: %s"%(url,e))
        except socket.error, e:
            if getattr(e,"errno",-1) == errno.EINTR:
                raise UIError("Connection timed out to url %s"%url)
            else:
                raise UIError("Error connecting to %s: %s"%(url, e))
    finally:
        signal.alarm(0)

    # Write the body to a temporary file, then rename it into place so the
    # cache never holds a partially downloaded copy.
    mode = 'w'
    if ftype in BIN_FILE_TYPES:
        mode = 'wb'
    outfile = open(fnameTmp, mode)
    try:
        while 1:
            s = infile.read(1<<16)
            if not s:
                break
            outfile.write(s)
    finally:
        infile.close()
        outfile.close()

    urlfile = open(fnameURL, 'w')
    print >>urlfile, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if "\n" in url:
        url = url.replace("\n", " ")
    print >>urlfile, url
    urlfile.close()

    os.rename(fnameTmp, fname)

def getURLs(entry):
    """Return a map from file type to the www_*_url fields set in a
       BibTeX entry."""
    r = {}
    for ftype in FILE_TYPES:
        ftype2 = ftype.replace(".", "_")
        url = entry.get("www_%s_url"%ftype2)
        if url:
            r[ftype] = url.strip().replace("\n", " ")
    return r

def getCachedURL(key, ftype, section):
    """Return the URL from which the cached copy of <key>.<ftype> was
       downloaded, or None if there is no cached copy."""
    fname = getCacheFname(key, ftype, section)
    urlFname = fname+".url"
    if not os.path.exists(fname) or not os.path.exists(urlFname):
        return None
    f = open(urlFname, 'r')
    lines = f.readlines()
    f.close()
    if len(lines) != 2:
        print >>sys.stderr, "ERROR: unexpected number of lines in", urlFname
        return None
    return lines[1].strip()

def downloadAll(bibtex, missingOnly=0):
    """Download every linked file in the bibliography; returns a list of
       (key, ftype, url, error) tuples for the downloads that failed."""
    errors = []
    for e in bibtex.entries:
        urls = getURLs(e)
        key = e.key
        section = e.get("www_cache_section", ".")
        for ftype, url in urls.items():
            if missingOnly:
                cachedURL = getCachedURL(key, ftype, section)
                if cachedURL == url:
                    print >>sys.stderr, "Skipping", url
                    continue
                elif cachedURL is not None:
                    print >>sys.stderr, "URL for %s.%s has changed"%(key,ftype)
                else:
                    print >>sys.stderr, "I have no copy of %s.%s"%(key,ftype)
            try:
                downloadFile(key, ftype, section, url)
                print "Downloaded", url
            except UIError, e:
                print >>sys.stderr, str(e)
                errors.append((key,ftype,url,str(e)))
            except (IOError, socket.error), e:
                msg = "Error downloading %s: %s"%(url,str(e))
                print >>sys.stderr, msg
                errors.append((key,ftype,url,msg))
        if urls.has_key("ps") and not urls.has_key("ps.gz"):
            # Say, this is something we'd like to have gzipped locally.
            psFname = getCacheFname(key, "ps", section)
            psGzFname = getCacheFname(key, "ps.gz", section)
            if os.path.exists(psFname) and not os.path.exists(psGzFname):
                # This is something we haven't gzipped yet.
                print "Compressing a copy of", psFname
                outf = gzip.GzipFile(psGzFname, "wb")
                inf = open(psFname, "rb")
                while 1:
                    s = inf.read(4096)
                    if not s:
                        break
                    outf.write(s)
                outf.close()
                inf.close()

    return errors

if __name__ == '__main__':
    if len(sys.argv) == 2:
        print "Loading from %s"%sys.argv[1]
    else:
        print >>sys.stderr, "Expected a single configuration file as an argument"
        sys.exit(1)
    config.load(sys.argv[1])
    if config.CACHE_UMASK != None:
        os.umask(config.CACHE_UMASK)
    bib = BibTeX.parseFile(config.MASTER_BIB)
    downloadAll(bib, missingOnly=1)