# Make rankings of papers and authors for automatic classification of content hotness

# Google Scholar address
# http://scholar.google.com/scholar?as_epq=

# Take care of the caching setup
cache_expire = 60*60*24*30 # 30 days

# Checks
import config
import os
import sys
from os.path import exists, isdir, join, getmtime
from os import listdir, remove

def remove_old():
    # Remove all old cached files
    filenames = listdir(cache_folder())
    from time import time
    now = time()
    for f in filenames:
        pf = join(cache_folder(), f)
        time_mt = getmtime(pf)
        if now - time_mt > cache_expire: # 30 days
            remove(pf)

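# Note: remove_old() is invoked from the command-line entry point at the bottom of
# this file before any downloading starts, so pages older than cache_expire are
# simply re-fetched on the next run.
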
def cache_folder():
    r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
    if not exists(r):
        os.makedirs(r)
    assert isdir(r)
    return r

import re
from urllib2 import urlopen, build_opener
from urllib import quote
from datetime import date
import hashlib

# A more handy hash
def md5h(s):
    m = hashlib.md5()
    m.update(s)
    return m.hexdigest()

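# The hex digest doubles as the cache filename: getPageForTitle() below stores each
# downloaded Scholar result page under cache_folder()/md5h(url).
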
format_tested = 0

def getPageForTitle(title, cache=True, update=True, save=True):
    # Returns a (url, page-text) tuple; page-text is None when the result is
    # not already cached and update is False.
    global format_tested
    if not format_tested and update:
        format_tested = 1
        TestScholarFormat()

    # Do not assume that the title is clean
    title = re.sub("\s+", " ", title)
    title = re.sub("[^'a-zA-Z0-9\. \-\/:]", "", title)
    title = re.sub("'\/", " ", title)

    # We rely on google scholar to return the article with this exact title
    gurl = "http://scholar.google.com/scholar?as_q=&as_epq=%s&as_occt=title"

    url = gurl % quote(title)

    # Access cache or network
    if exists(join(cache_folder(), md5h(url))) and cache:
        return url, file(join(cache_folder(), md5h(url)),'r').read()
    elif update:
        print "Downloading rank for %r."%title

        # Make a custom user agent (so that we are not filtered by Google)!
        opener = build_opener()
        opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]

        print "connecting..."
        connection = opener.open(url)
        print "reading"
        page = connection.read()
        print "done"
        if save:
            file(join(cache_folder(), md5h(url)),'w').write(page)
        return url, page
    else:
        return url, None

def getCite(title, cache=True, update=True, save=True):
    # Returns a (citation-count, scholar url) tuple, or (None, None)
    url, page = getPageForTitle(title, cache=cache, update=update, save=save)
    if not page:
        return None,None

    # Check if it finds any articles
    if len(re.findall("did not match any articles", page)) > 0:
        return (None, None)

    # Kill all tags!
    cpage = re.sub("<[^>]*>", "", page)

    # Add up all citations
    s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
    return (s, url)

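# Sketch of the expected return shape (the citation count here is illustrative,
# not real data):
#   getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System")
#     -> (42, "http://scholar.google.com/scholar?as_q=&as_epq=Stop-and-Go%20MIXes...")
#   getCite("a title Scholar has never indexed") -> (None, None)
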
def getPaperURLs(title, cache=True, update=True, save=True):
    url, page = getPageForTitle(title, cache=cache, update=update, save=save)
    if not page:
        return []
    pages = re.findall(r'\&\#x25ba\;.*class=fl href="([^"]*)"', page)
    return pages

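# The pattern in getPaperURLs() above assumes the markup Google Scholar used when
# this script was written: a "&#x25ba;" (black right-pointing arrow) entity followed
# by result links with class=fl. If Scholar changes that markup, the function will
# quietly return an empty list.
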
def get_rank_html(title, years=None, base_url=".", update=True,
                  velocity=False):
    s,url = getCite(title, update=update)

    # Paper cannot be found
    if s is None:
        return ''

    html = ''

    url = url.replace("&","&amp;")

    # Hotness: gold badge at >= 50 citations, silver at >= 5
    H,h = 50,5
    if s >= H:
        html += '<a href="%s"><img src="%s/gold.gif" alt="More than %s citations on Google Scholar" title="More than %s citations on Google Scholar" /></a>' % (url,base_url,H,H)
    elif s >= h:
        html += '<a href="%s"><img src="%s/silver.gif" alt="More than %s citations on Google Scholar" title="More than %s citations on Google Scholar" /></a>' % (url,base_url,h,h)

    # Only include the velocity if asked.
    if velocity:
        # Velocity: citations per year since publication (ups.gif for more than 2
        # per year, upb.gif for 10 or more per year)
        d = date.today().year - int(years)
        if d >= 0:
            if 2 < s / (d +1) < 10:
                html += '<img src="%s/ups.gif" />' % base_url
            if 10 <= s / (d +1):
                html += '<img src="%s/upb.gif" />' % base_url

    return html

def TestScholarFormat():
    # We need to ensure that Google Scholar does not change its page format under our feet
    # Use some cases to check if all is good
    print "Checking google scholar formats..."
    stopAndGoCites = getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False)[0]
    dragonCites = getCite("Mixes protected by Dragons and Pixies: an empirical study", False, save=False)[0]

    if stopAndGoCites in (0, None):
        print """OOPS.\n
It looks like Google Scholar changed their URL format or their output format.
I went to count the cites for the Stop-and-Go MIXes paper, and got nothing."""
        sys.exit(1)

    if dragonCites != None:
        print """OOPS.\n
It looks like Google Scholar changed their URL format or their output format.
I went to count the cites for a fictitious paper, and found some."""
        sys.exit(1)

def urlIsUseless(u):
    if u.find("freehaven.net/anonbib/") >= 0:
        # Our own cache is not the primary citation for anything.
        return True
    elif u.find("owens.mit.edu") >= 0:
        # These citations only work for 'members of the MIT community'.
        return True
    else:
        return False

URLTYPES=[ "pdf", "ps", "txt", "ps_gz", "html" ]

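# Usage, as implied by the argument handling below (hypothetical invocation):
#   python rank.py [suggest] <config-file>
# The default mode downloads any missing citation counts for entries in the master
# bibliography; with 'suggest', entries lacking a www_*_url field also get candidate
# paper URLs printed.
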
if __name__ == '__main__':
    # First, load the bibliography file.
    import BibTeX
    suggest = False
    if sys.argv[1] == 'suggest':
        suggest = True
        del sys.argv[1]

    config.load(sys.argv[1])
    if config.CACHE_UMASK != None:
        os.umask(config.CACHE_UMASK)
    bib = BibTeX.parseFile(config.MASTER_BIB)
    remove_old()

    print "Downloading missing ranks."
    for ent in bib.entries:
        getCite(ent['title'], cache=True, update=True)

    if suggest:
        for ent in bib.entries:
            haveOne = False
            for utype in URLTYPES:
                if ent.has_key("www_%s_url"%utype):
                    haveOne = True
                    break
            if haveOne:
                continue
            print ent.key, "has no URLs given."
            urls = [ u for u in getPaperURLs(ent['title']) if not urlIsUseless(u) ]
            for u in urls:
                print "\t", u