#!/usr/bin/python
# Copyright 2003-2008, Nick Mathewson. See LICENSE for licensing info.

"""Download the files referenced in a bibliography into a local cache."""

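# Usage sketch (package and config names are illustrative; the relative
# imports below mean the script has to be run as a module):
#
#   python -m anonbib.updateCache anonbib.cfg
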
from __future__ import absolute_import
from __future__ import print_function

import errno
import getopt
import gzip
import os
import signal
import socket
import sys
import time

import six.moves.http_client
import six.moves.urllib.error
import six.moves.urllib.parse
import six.moves.urllib.request

from . import BibTeX
from . import config

# File types we know how to cache; BIN_FILE_TYPES marks the ones whose
# content is binary rather than text.
FILE_TYPES = [ "txt", "html", "pdf", "ps", "ps.gz", "abstract" ]
BIN_FILE_TYPES = [ 'pdf', 'ps.gz' ]

class UIError(Exception):
    """Exception for errors that should be reported to the user without
       a stack trace."""

def tryUnlink(fn):
    """Remove the file 'fn' if it exists; do nothing if it doesn't."""
    try:
        os.unlink(fn)
    except OSError:
        pass

def getCacheFname(key, ftype, section):
    return BibTeX.smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
                            section,
                            "%s.%s" % (key, ftype))

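# For example (hypothetical key and config values), the cached PDF for an
# entry "minion-design" in section "sec1" would be stored at roughly
# <OUTPUT_DIR>/<CACHE_DIR>/sec1/minion-design.pdf.
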
def downloadFile(key, ftype, section, url, timeout=None):
    """Download 'url' into the cache as the 'ftype' file for entry 'key',
       first clearing away any stale temporary file."""
    if timeout is None:
        timeout = config.DOWNLOAD_CONNECT_TIMEOUT
    fname = getCacheFname(key, ftype, section)
    parent = os.path.split(fname)[0]
    if not os.path.exists(parent):
        os.makedirs(parent)

    fnameTmp = fname+".tmp"
    fnameURL = fname+".url"
    tryUnlink(fnameTmp)

    # Bound the connection attempt with SIGALRM; an interrupted connect
    # shows up below as a socket error with errno EINTR.
    def sigalrmHandler(sig, _):
        pass
    signal.signal(signal.SIGALRM, sigalrmHandler)
    signal.alarm(timeout)
    try:
        try:
            infile = six.moves.urllib.request.urlopen(url)
        except six.moves.http_client.InvalidURL as e:
            raise UIError("Invalid URL %s: %s" % (url, e))
        except socket.error as e:
            # socket.error must be handled before IOError (it is a
            # subclass), or the timeout check here would never run.
            if getattr(e, "errno", -1) == errno.EINTR:
                raise UIError("Connection timed out to url %s" % url)
            else:
                raise UIError("Error connecting to %s: %s" % (url, e))
        except IOError as e:
            raise UIError("Cannot connect to url %s: %s" % (url, e))
    finally:
        signal.alarm(0)

    # urlopen() hands back bytes, so write the temporary file in binary
    # mode for every file type; opening text types with mode 'w' would
    # fail under Python 3.
    outfile = open(fnameTmp, 'wb')
    try:
        while True:
            s = infile.read(1<<16)
            if not s:
                break
            outfile.write(s)
    finally:
        infile.close()
        outfile.close()

    # Record when and from where we fetched the file, then move the
    # finished download into place.
    urlfile = open(fnameURL, 'w')
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), file=urlfile)
    if "\n" in url:
        url = url.replace("\n", " ")
    print(url, file=urlfile)
    urlfile.close()

    os.rename(fnameTmp, fname)

def getURLs(entry):
    """Return a map from file type to the URL named by the entry's
       corresponding www_<type>_url field."""
    r = {}
    for ftype in FILE_TYPES:
        ftype2 = ftype.replace(".", "_")
        url = entry.get("www_%s_url" % ftype2)
        if url:
            r[ftype] = url.strip().replace("\n", " ")
    return r

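# For instance, an entry with a "www_pdf_url" field contributes {"pdf":
# <url>}, and a "www_ps_gz_url" field maps back to the "ps.gz" file type,
# since dots in FILE_TYPES become underscores in the field names.
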
def getCachedURL(key, ftype, section):
    """Return the URL from which the cached copy of 'key.ftype' was
       fetched, or None if there is no usable cached copy."""
    fname = getCacheFname(key, ftype, section)
    urlFname = fname+".url"
    if not os.path.exists(fname) or not os.path.exists(urlFname):
        return None
    f = open(urlFname, 'r')
    lines = f.readlines()
    f.close()
    if len(lines) != 2:
        print("ERROR: unexpected number of lines in", urlFname,
              file=sys.stderr)
        # Treat a malformed sidecar as a missing cache entry rather than
        # indexing past the end of 'lines'.
        return None
    return lines[1].strip()

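# The ".url" sidecar written by downloadFile() holds exactly two lines, a
# timestamp and the source URL; the URL line is what downloadAll() compares
# against an entry's current URL to decide whether the cache is fresh.
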
def downloadAll(bibtex, missingOnly=0):
    """Download every file for every entry in 'bibtex'; return a list of
       (key, ftype, url, error) tuples for the downloads that failed."""
    errors = []
    for e in bibtex.entries:
        urls = getURLs(e)
        key = e.key
        section = e.get("www_cache_section", ".")
        for ftype, url in urls.items():
            if missingOnly:
                cachedURL = getCachedURL(key, ftype, section)
                if cachedURL == url:
                    print("Skipping", url, file=sys.stderr)
                    continue
                elif cachedURL is not None:
                    print("URL for %s.%s has changed" % (key, ftype),
                          file=sys.stderr)
                else:
                    print("I have no copy of %s.%s" % (key, ftype),
                          file=sys.stderr)
            try:
                downloadFile(key, ftype, section, url)
                print("Downloaded", url)
            except UIError as err:
                # Bind to 'err', not 'e': the except target is unbound
                # after the block under Python 3, and 'e' is the
                # enclosing loop's bibliography entry.
                print(str(err), file=sys.stderr)
                errors.append((key, ftype, url, str(err)))
            except (IOError, socket.error) as err:
                msg = "Error downloading %s: %s" % (url, str(err))
                print(msg, file=sys.stderr)
                errors.append((key, ftype, url, msg))
if "ps" in urls and "ps.gz" not in urls:
|
|
# Say, this is something we'd like to have gzipped locally.
|
|
psFname = getCacheFname(key, "ps", section)
|
|
psGzFname = getCacheFname(key, "ps.gz", section)
|
|
if os.path.exists(psFname) and not os.path.exists(psGzFname):
|
|
# This is something we haven't gzipped yet.
|
|
print("Compressing a copy of",psFname)
|
|
outf = gzip.GzipFile(psGzFname, "wb")
|
|
inf = open(psFname, "rb")
|
|
while 1:
|
|
s = inf.read(4096)
|
|
if not s:
|
|
break
|
|
outf.write(s)
|
|
outf.close()
|
|
inf.close()
|
|
|
|
return errors
|
|
|
|
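# A minimal driver sketch (config filename illustrative), for when you want
# to inspect the failures that downloadAll() reports:
#
#   config.load("anonbib.cfg")
#   bib = BibTeX.parseFile(config.MASTER_BIB)
#   for key, ftype, url, err in downloadAll(bib, missingOnly=1):
#       print("FAILED %s.%s <- %s: %s" % (key, ftype, url, err))
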
if __name__ == '__main__':
    if len(sys.argv) == 2:
        print("Loading from %s" % sys.argv[1])
    else:
        print("Expected a single configuration file as an argument",
              file=sys.stderr)
        sys.exit(1)
    config.load(sys.argv[1])

    if config.CACHE_UMASK is not None:
        os.umask(config.CACHE_UMASK)

    bib = BibTeX.parseFile(config.MASTER_BIB)
    downloadAll(bib, missingOnly=1)