#!/usr/bin/python
# Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.

"""Download files in bibliography into a local cache.
"""

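# Usage: python updateCache.py <config file>.  The config module is
# expected to define OUTPUT_DIR, CACHE_DIR, DOWNLOAD_CONNECT_TIMEOUT,
# CACHE_UMASK, and MASTER_BIB (all referenced below).
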
import os
import sys
import signal
import time
import gzip

import BibTeX
import config
import urllib2
import getopt
import socket
import errno
import httplib

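# BibTeX and config are local modules shipped alongside this script; the
# rest come from the Python 2 standard library.
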
FILE_TYPES = [ "txt", "html", "pdf", "ps", "ps.gz", "abstract" ]
BIN_FILE_TYPES = [ 'pdf', 'ps.gz' ]

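# Raised for problems (bad URLs, failed connections, timeouts) that are
# reported to the user; downloadAll() catches these and records them.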
class UIError(Exception):
    pass

def tryUnlink(fn):
    try:
        os.unlink(fn)
    except OSError:
        pass

def getCacheFname(key, ftype, section):
    return BibTeX.smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
                            section,
                            "%s.%s" % (key, ftype))

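# Cache layout: each document lands at OUTPUT_DIR/CACHE_DIR/<section>/
# <key>.<ftype>, next to a "<key>.<ftype>.url" sidecar recording when and
# from where it was fetched.  Downloads are written to a ".tmp" file and
# renamed into place only on success.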
def downloadFile(key, ftype, section, url, timeout=None):
    if timeout is None:
        timeout = config.DOWNLOAD_CONNECT_TIMEOUT
    fname = getCacheFname(key, ftype, section)
    parent = os.path.split(fname)[0]
    if not os.path.exists(parent):
        os.makedirs(parent)

    fnameTmp = fname + ".tmp"
    fnameURL = fname + ".url"
    tryUnlink(fnameTmp)

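    # Use SIGALRM as a crude connect timeout: the no-op handler lets the
    # alarm interrupt the blocking urlopen() call (instead of killing the
    # process), and the interrupted call surfaces as EINTR below.  Note
    # that signal.alarm() is Unix-only.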
    def sigalrmHandler(sig, _):
        pass
    signal.signal(signal.SIGALRM, sigalrmHandler)
    signal.alarm(timeout)
    try:
        try:
            infile = urllib2.urlopen(url)
        except httplib.InvalidURL, e:
            raise UIError("Invalid URL %s: %s" % (url, e))
        except socket.error, e:
            # Check socket.error before IOError: on Python >= 2.6 it is an
            # IOError subclass, and an alarm-interrupted connect shows up
            # here as EINTR.
            if getattr(e, "errno", -1) == errno.EINTR:
                raise UIError("Connection timed out to url %s" % url)
            else:
                raise UIError("Error connecting to %s: %s" % (url, e))
        except IOError, e:
            raise UIError("Cannot connect to url %s: %s" % (url, e))
    finally:
        signal.alarm(0)

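    # Binary types must be written in binary mode so that newline
    # translation on some platforms cannot corrupt them.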
    mode = 'w'
    if ftype in BIN_FILE_TYPES:
        mode = 'wb'
    outfile = open(fnameTmp, mode)
    try:
        while 1:
            s = infile.read(1 << 16)
            if not s:
                break
            outfile.write(s)
    finally:
        infile.close()
        outfile.close()

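    # Record the fetch time and source URL; getCachedURL() reads this
    # sidecar back to decide whether a cached copy is still current.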
    urlfile = open(fnameURL, 'w')
    print >>urlfile, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if "\n" in url:
        url = url.replace("\n", " ")
    print >>urlfile, url
    urlfile.close()

    os.rename(fnameTmp, fname)

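# Example with a hypothetical key and URL:
#   downloadFile("minion-design", "pdf", "Anonymity",
#                "http://example.com/minion-design.pdf")
# would leave the PDF at OUTPUT_DIR/CACHE_DIR/Anonymity/minion-design.pdf.
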
def getURLs(entry):
    """Return a map from file type to URL for each www_<type>_url field
       set on 'entry'."""
    r = {}
    for ftype in FILE_TYPES:
        ftype2 = ftype.replace(".", "_")
        url = entry.get("www_%s_url" % ftype2)
        if url:
            r[ftype] = url.strip().replace("\n", " ")
    return r

def getCachedURL(key, ftype, section):
    """Return the URL that the cached copy of key.ftype was downloaded
       from, or None if there is no usable cached copy."""
    fname = getCacheFname(key, ftype, section)
    urlFname = fname + ".url"
    if not os.path.exists(fname) or not os.path.exists(urlFname):
        return None
    f = open(urlFname, 'r')
    lines = f.readlines()
    f.close()
    if len(lines) != 2:
        print >>sys.stderr, "ERROR: unexpected number of lines in", urlFname
        return None
    return lines[1].strip()

def downloadAll(bibtex, missingOnly=0):
    """Download every file mentioned in the bibliography; return a list
       of (key, ftype, url, error) tuples for the downloads that failed."""
    errors = []
    for e in bibtex.entries:
        urls = getURLs(e)
        key = e.key
        section = e.get("www_cache_section", ".")
        for ftype, url in urls.items():
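            # With missingOnly set, skip anything whose cached copy came
            # from the same URL; re-fetch if the URL changed or we have
            # no copy at all.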
            if missingOnly:
                cachedURL = getCachedURL(key, ftype, section)
                if cachedURL == url:
                    print >>sys.stderr, "Skipping", url
                    continue
                elif cachedURL is not None:
                    print >>sys.stderr, "URL for %s.%s has changed" % (key, ftype)
                else:
                    print >>sys.stderr, "I have no copy of %s.%s" % (key, ftype)
            try:
                downloadFile(key, ftype, section, url)
                print "Downloaded", url
            except UIError, e:
                print >>sys.stderr, str(e)
                errors.append((key, ftype, url, str(e)))
            except (IOError, socket.error), e:
                msg = "Error downloading %s: %s" % (url, str(e))
                print >>sys.stderr, msg
                errors.append((key, ftype, url, msg))
        if "ps" in urls and "ps.gz" not in urls:
            # Say, this is something we'd like to have gzipped locally.
            psFname = getCacheFname(key, "ps", section)
            psGzFname = getCacheFname(key, "ps.gz", section)
            if os.path.exists(psFname) and not os.path.exists(psGzFname):
                # This is something we haven't gzipped yet.
                print "Compressing a copy of", psFname
                outf = gzip.GzipFile(psGzFname, "wb")
                inf = open(psFname, "rb")
                while 1:
                    s = inf.read(4096)
                    if not s:
                        break
                    outf.write(s)
                outf.close()
                inf.close()

    return errors

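# Entry point: load the named configuration, set the cache umask, parse
# the master bibliography, and fetch whatever is missing from the cache.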
if __name__ == '__main__':
    if len(sys.argv) == 2:
        print "Loading from %s" % sys.argv[1]
    else:
        print >>sys.stderr, "Expected a single configuration file as an argument"
        sys.exit(1)
    config.load(sys.argv[1])

    if config.CACHE_UMASK is not None:
        os.umask(config.CACHE_UMASK)

    bib = BibTeX.parseFile(config.MASTER_BIB)
    downloadAll(bib, missingOnly=1)