# -*- coding: utf-8 -*-
################################################################################
# #
# This file is part of the Bibolamazi Project. #
# Copyright (C) 2013 by Philippe Faist #
# philippe.faist@bluewin.ch #
# #
# Bibolamazi is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# Bibolamazi is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with Bibolamazi. If not, see <http://www.gnu.org/licenses/>. #
# #
################################################################################
import re
from urllib.error import URLError, HTTPError
import textwrap
import time
import logging
logger = logging.getLogger(__name__)
import arxiv2bib
from bibolamazi.core.bibusercache import BibUserCacheAccessor, BibUserCacheError
from bibolamazi.core.bibusercache.tokencheckers import EntryFieldsTokenChecker
from bibolamazi.core import butils
class BibArxivApiFetchError(BibUserCacheError):
def __init__(self, msg):
super().__init__('arxiv_fetched_api_info', msg)
#
# --- code to detect arXiv info ---
#
_RX_BEFORE = r'(?:\s*([;,]?\s*)|\b|\s+|^)'
_RX_AFTER = r'(?:\s*[;,]?\s*|$)'
_RX_PRIMARY_CLASS_PAT = r'[-a-zA-Z0-9\._]+'
# only the numerical arxiv ID (+possible version)
_RX_ARXIVID_NUM_PAT = r'(?<!\d)(?:\d{4}\.\d{4,}|\d{7})(?:v\d+)?'
_RX_ARXIVID_NUM = r'(?P<arxivid>'+_RX_ARXIVID_NUM_PAT+r')'
# allow primary-class/ etc.
_RX_ARXIVID_TOL = r'(?P<arxivid>(?:'+_RX_PRIMARY_CLASS_PAT+r'/)?'+_RX_ARXIVID_NUM_PAT+r')'
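# Illustrative examples (not exhaustive): _RX_ARXIVID_NUM_PAT matches new-style
# IDs like '1211.1037' or '1211.1037v2' as well as the bare old-style number
# '0701123'; _RX_ARXIVID_TOL additionally tolerates a primary-class prefix,
# e.g. 'quant-ph/0701123'.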
def _mk_braced_pair_rx(mid):
return [ re.compile(_RX_BEFORE + r'\{\s*' + mid + r'\s*\}' + _RX_AFTER, re.IGNORECASE) ,
re.compile(_RX_BEFORE + mid + _RX_AFTER, re.IGNORECASE) ]
# A list of regexes that we will need often.
#
# The following are regexes we check for in url fields. Don't include all
# regexes, because some DOIs or parts of URLs may contain character sequences
# that match the more permissive arXiv regexes.
_rxarxiv_in_url = \
[] + \
_mk_braced_pair_rx(
r'\\href\s*\{\s*(?:https?://)?arxiv\.org/(?:abs|pdf)/'
+ _RX_ARXIVID_TOL + r'\s*\}\s*\{[^\{\}]*\}'
) + \
_mk_braced_pair_rx(
r'\\(?:url|href)\s*\{\s*(?:https?://)?arxiv\.org/(?:abs|pdf)/'
+ _RX_ARXIVID_TOL + r'\s*\}'
) + \
_mk_braced_pair_rx(
r'(?:https?://)?arxiv\.org/(?:abs|pdf)/' + _RX_ARXIVID_TOL + r'\s*'
)
# These are the most tolerant regexes; we check for these more or less
# everywhere except in the URL fields.
_rxarxiv = _rxarxiv_in_url + \
_mk_braced_pair_rx(
r'(?:https?://)?arxiv\.org/(?:abs|pdf)/' + _RX_ARXIVID_TOL
) + \
_mk_braced_pair_rx(
r'(?:arXiv[-.:/\s]+)?((?:(?P<primaryclass>'
+ _RX_PRIMARY_CLASS_PAT + r')/)?' + _RX_ARXIVID_NUM + r')'
+ r'(?:\s*\[(?P<primaryclass2>' + _RX_PRIMARY_CLASS_PAT + r')\])?'
)
# getting "pure" arxiv ID means the arxiv ID (with primary class for old IDs
# only), without version information.
_rx_purearxivid = re.compile(r'(?P<purearxivid>((\d{4}\.\d{4,})|'+
r'('+_RX_PRIMARY_CLASS_PAT+r'/\d{7}))(v\d+)?)', re.IGNORECASE)
_rx_aid_year = re.compile(r'(?P<year>\d{2})(?P<mon>\d{2})(?:\.\d{4,}|\d{3})')
rx_arxiv_own_doi = re.compile(r'^((https?://)?(dx\.)?(doi\.org/))?10\.48550/arXiv\.(?P<arxivid>.*)$', re.IGNORECASE)
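# E.g. both '10.48550/arXiv.2101.01234' and
# 'https://doi.org/10.48550/arXiv.2101.01234' match, with the named group
# 'arxivid' capturing '2101.01234'.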
#
# A list of fields which are inspected for arXiv information. This is useful for
# cache invalidation in various instances.
#
arxivinfo_from_bibtex_fields = [
'journal', 'doi', 'eprint', 'arxivid', 'arxiv', 'url',
'note', 'annote', 'primaryclass',
'archiveprefix', ]
# extract arXiv info from an entry
def detectEntryArXivInfo(entry):
"""
Extract arXiv information from a `pybtex.database.Entry` bibliographic
entry.

Returns upon success a dictionary of the form::

    { 'primaryclass':  <primary class, if available>,
      'arxivid':       <the (minimal) arXiv ID (in format XXXX.XXXX or archive/XXXXXXX)>,
      'archiveprefix': <value of the 'archiveprefix' field>,
      'published':     True/False <whether this entry was published in a journal other than the arXiv>,
      'doi':           <DOI of the entry, if any, otherwise None>,
      'year':          <year in the preprint arXiv ID number (4-digit string)>,
      'isoldarxivid':  <whether the arXiv ID is old-style, i.e. 'primary-class/XXXXXXX'>,
      'isnewarxivid':  <whether the arXiv ID is new-style, i.e. 'XXXX.XXXX+' (with 4 or more digits after the dot)>,
    }

Note that 'published' is set to True for PhD and Master's theses. The
`arxiv` filter handles this case separately and explicitly; its option
`-dThesesCountAsPublished=0` has no effect here.

If no arXiv information was detected, this function returns None.
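
Example (a minimal illustrative sketch; the entry and its field values are
hypothetical)::

    from pybtex.database import Entry
    entry = Entry('article', fields={'eprint': '1211.1037',
                                     'journal': 'Phys. Rev. A'})
    info = detectEntryArXivInfo(entry)
    # info['arxivid'] == '1211.1037', info['published'] is True,
    # info['year'] == '2012', info['isnewarxivid'] is True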
"""
fields = entry.fields
d = { 'primaryclass': None ,
'arxivid': None ,
'published': True ,
'archiveprefix': None,
'doi': None,
'year': None,
'isoldarxivid': None,
'isnewarxivid': None,
}
#
# NOTE: If you add/change the fields that are used here, make sure you
# update the EntryFieldsTokenChecker below!
#
if (entry.type == u'unpublished' or entry.type == u'misc'):
d['published'] = False
elif entry.type in (u'phdthesis', u'mastersthesis',):
# by default, PhD theses and Master's thesis count as published
# (although this case is handled specially in the arxiv filter)
d['published'] = True
elif entry.type in (u'book', u'booksection', u'inproceedings',
u'incollection', u'conference',
u'inbook', u'proceedings',):
# proceedings, books, etc. are published
d['published'] = True
elif ('journal' in fields and re.search(r'arxiv', fields['journal'], re.IGNORECASE)):
# if journal is the arXiv, then it's not published.
d['published'] = False
elif ('journal' in fields and fields['journal'].strip()):
# otherwise, if there is a journal, it's published
d['published'] = True
elif ('journal' not in fields or fields['journal'].strip() == ""):
# if there's no journal for an article or an unknown publication type,
# it's the arxiv.
d['published'] = False
else:
logger.longdebug('No decisive information about whether this entry is published: %s (type %s), '
'defaulting to True.', entry.key, entry.type)
def extract_pure_id(x, primaryclass=None):
m = _rx_purearxivid.search( (primaryclass+'/' if primaryclass else "") + x)
if m is None:
raise IndexError
return m.group('purearxivid')
if 'doi' in fields:
dois = re.split(r'[ \t\n,]+', fields['doi'])
for doi in dois:
if doi.strip() == "":
continue
m = re.match(rx_arxiv_own_doi, doi)
if m is not None:
# get arXiv ID
d['arxivid'] = m.group('arxivid')
else:
    # this is a journal DOI -- keep only the first one
    if d.get('doi', None) is None:
        d['doi'] = doi
if d['arxivid'] is None:
for eprintfield in ('arxivid', 'arxiv', 'eprint'):
if eprintfield not in fields:
continue
# this field might reveal the arxiv ID
arxivid = None
try:
arxivid = extract_pure_id(fields[eprintfield], primaryclass=fields.get('primaryclass', None))
except IndexError as e:
logger.longdebug("Indexerror: invalid arXiv ID in field ‘%s’ [%r/]%r: %s",
eprintfield, fields.get('primaryclass',None), fields[eprintfield], e)
# could be because, e.g., Zotero exporter used the PubMed ID here.
logger.debug("Entry `%s' has invalid arXiv ID %r in eprint field ‘%s’", entry.key, fields[eprintfield],
eprintfield)
continue
if arxivid is not None:
d['arxivid'] = arxivid
m = re.match(r'^([-\w.]+)/', arxivid)
if (m):
d['primaryclass'] = m.group(1)
break
if ('primaryclass' in fields):
d['primaryclass'] = fields['primaryclass']
if ('archiveprefix' in fields):
d['archiveprefix'] = fields['archiveprefix']
logger.longdebug("processed doi,eprint,arxiv,arxivid,primaryclass,archiveprefix fields -> d = %r", d)
def processNoteField(notefield, d, isurl=False):
if isurl:
rxlist = _rxarxiv_in_url
else:
rxlist = _rxarxiv
for rx in rxlist:
m = rx.search(notefield)
if m:
#logger.longdebug("Note field %r: arxiv rx %r matched", notefield, rx.pattern)
if (not d['arxivid']):
try:
primaryclass = None
try: primaryclass = m.group('primaryclass2')
except IndexError: pass
try: primaryclass = m.group('primaryclass')
except IndexError: pass
#logger.longdebug("arxivid (maybe with prim-class) = %r", m.group('arxivid'))
d['arxivid'] = extract_pure_id(m.group('arxivid'), primaryclass=primaryclass)
except IndexError as e:
logger.longdebug("indexerror while getting arxivid in note=%r, m=%r: %s",
notefield, m, e)
if (not d['primaryclass']):
primaryclass = None
try:
primaryclass = m.group('primaryclass')
except IndexError:
pass
try:
primaryclass = m.group('primaryclass2')
except IndexError:
pass
if primaryclass and primaryclass not in ['abs', 'pdf', 'abs/', 'pdf/']:
d['primaryclass'] = primaryclass
if d['arxivid'] and d['primaryclass']:
return
#logger.longdebug("d = %r", d)
if ('note' in fields):
processNoteField(fields['note'], d)
if ('annote' in fields):
processNoteField(fields['annote'], d)
if ('url' in fields):
processNoteField(fields['url'], d, isurl=True)
logger.longdebug("processed note,annote,url fields -> d = %r", d)
if (d['arxivid'] is None):
# no arXiv info.
return None
# FIX: if archive-ID is old style, and does not contain the primary class, add it as "quant-ph/XXXXXXX"
if (re.match(r'^\d{7}$', d['arxivid']) and d['primaryclass'] and len(d['primaryclass']) > 0):
d['arxivid'] = d['primaryclass']+'/'+d['arxivid']
# set whether old style or new style arXiv ID
if re.match(r'^\d{4}\.\d{4,}(v\d+)?$', d['arxivid']):
d['isoldarxivid'] = False
d['isnewarxivid'] = True
elif re.match(r'^'+_RX_PRIMARY_CLASS_PAT+r'/\d{7}(v\d+)?$', d['arxivid']):
d['isoldarxivid'] = True
d['isnewarxivid'] = False
else:
d['isoldarxivid'] = False # can't determine arxiv ID style ...
d['isnewarxivid'] = False # can't determine arxiv ID style ...
# get the year
m = _rx_aid_year.search(d['arxivid'])
if not m:
logger.warning("Couldn't find the year in arXiv ID %r", d['arxivid'])
else:
# 91->1991, 89->2089 (arXiv started in 1991)
d['year'] = str(1990 + (int(m.group('year')) - 90) % 100)
logger.longdebug("finished detection -> d = %r", d)
return d
def stripArXivInfoInNote(notestr):
"""
Assumes that `notestr` is the contents of a note={} field of a bibtex
entry, and strips any arXiv identifier information found, e.g. of the
form 'arXiv:XXXX.YYYY' (or similar).
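
Example (illustrative)::

    stripArXivInfoInNote("Preprint at arXiv:1211.1037v2.")
    # -> "Preprint at." (the identifier and the adjoining separators
    #    are removed)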
"""
newnotestr = notestr
for rx in _rxarxiv:
# replace all occurrences of the regexes in _rxarxiv with nothing.
newnotestr = rx.sub('', newnotestr)
if (notestr != newnotestr):
logger.longdebug("stripArXivInfoInNote: stripped %r to %r", notestr, newnotestr)
return newnotestr
# ---- API info ------
class ArxivFetchedAPIInfoCacheAccessor(BibUserCacheAccessor):
"""
A `BibUserCacheAccessor` for fetching and accessing information retrieved
from the arXiv API.
"""
def __init__(self, **kwargs):
super().__init__(
cache_name='arxiv_fetched_api_info',
**kwargs
)
# save arxiv IDs for which we couldn't retrieve information because the
# server responded with an error.
#
# For these IDs, we do not re-attempt to fetch them right away (to avoid
# multiple doomed requests), so that a new attempt is made only next
# time bibolamazi is run.
self.error_arxivids = {}
def initialize(self, cache_obj, **kwargs):
dic = self.cacheDic()
dic.setdefault('fetched', {})
#logger.longdebug("dic is %r\n"
# "id(dic['fetched'])=%r", dic, id(dic['fetched']))
logger.debug("arxiv_fetched_api_info: adding validation checker; time valid is %r",
cache_obj.cacheExpirationTokenChecker().time_valid)
# validate each entry with an expiration checker. Do this per entry, rather than
# globally on the full cache. (So don't use installCacheExpirationChecker())
dic['fetched'].set_validation(cache_obj.cacheExpirationTokenChecker())
def fetchArxivApiInfo(self, idlist):
"""
Populates the given cache with information about the arXiv entries given in
`idlist`. This must be, yes you guessed right, a list of arXiv
identifiers that we should fetch.

This function performs a query on the arXiv.org API, using the arxiv2bib
library. Please note that you should avoid making rapid-fire requests in
a row (this should normally not happen anyway thanks to our cache
mechanism). However, beware that if we get a ``403 Forbidden`` HTTP
answer, we should not continue, or else arXiv.org might interpret our
requests as a denial-of-service attack. If a ``403 Forbidden`` HTTP
answer is received, this function raises :py:exc:`BibArxivApiFetchError`
with a meaningful error text.

Only those entries in `idlist` which are not already in the cache are
fetched.

`idlist` can be any iterable.
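
Example (illustrative sketch; `accessor` stands for an instance of this
cache accessor, and the arXiv IDs are placeholders)::

    ok = accessor.fetchArxivApiInfo(['1211.1037', 'quant-ph/0512258'])
    if ok:
        info = accessor.getArxivApiInfo('1211.1037')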
"""
logger.longdebug("fetchArxivApiInfo(): idlist=%r", idlist)
# first, see which IDs of the idlist we actually need to fetch
cache_entrydic = self.cacheDic()['fetched']
logger.longdebug("fetchArxivApiInfo(): "
"id(dic['fetched'])=%r, \n"
"id(self.cacheObject().cachedic['arxiv_fetched_api_info']=%r\n"
"len(dic['fetched'])=%d",
id(cache_entrydic), id(self.cacheObject().cachedic['arxiv_fetched_api_info']),
len(cache_entrydic))
logger.longdebug("fetchArxivApiInfo(): in the cache, we have keys %r",
cache_entrydic.keys())
still_to_fetch = []
for aid in idlist:
if aid in self.error_arxivids:
logger.debug("Not re-trying to fetch info for %s, query failed moments ago", aid)
# we already tried to fetch this ID moments ago but failed---don't insist
continue
if (aid not in cache_entrydic or cache_entrydic.get(aid) is None or
cache_entrydic.get(aid).get('error', False)):
still_to_fetch.append(aid)
logger.longdebug("fetchArxivApiInfo(): still_to_fetch=%r", still_to_fetch)
# make sure we're not requesting more than batch_len arxiv ids at a time
# (or URLs can get too long and we'll get a HTTP 414 "URL too long"
# response)
batch_len = 64
sleep_interval = 1 # 1 req/second: see https://groups.google.com/d/msg/arxiv-api/wcPh0w38XN0/p7vKsxjb6ykJ
num_batches, rest = divmod(len(still_to_fetch), batch_len)
if rest:
num_batches += 1
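# e.g. 130 IDs with batch_len=64: divmod(130, 64) == (2, 2), so we end up
# with num_batches == 3 (two full batches of 64 plus one batch of 2)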
k = 0
logger.longdebug("fetchArxivApiInfo(): We need to fetch keys: %r", still_to_fetch)
while still_to_fetch:
thisbatch = still_to_fetch[:batch_len]
logger.info("Fetching information from arXiv.org (%d/%d)", k+1, num_batches)
# From the second batch onwards, wait a little while. Don't make rapid-fire
# requests to the arxiv because they don't like that:
# https://arxiv.org/help/robots
if k > 0:
time.sleep(sleep_interval)
ok = self._do_fetch_arxiv_api_info(thisbatch)
if not ok:
# logs
logger.info("Fetching information from arXiv.org failed :(")
return False
k += 1
still_to_fetch = still_to_fetch[len(thisbatch):]
if k > 0:
# message only if we actually fetched anything
logger.info("Fetching information from arXiv.org done.")
return True
def _do_fetch_arxiv_api_info(self, idlist):
if ArxivFetchedAPIInfoCacheAccessor.arxiv_403_received:
logger.warning("Not fetching any more arXiv data because we've been "
"previously sent a \"HTTP 403 Forbidden\" response. "
"See https://arxiv.org/help/robots")
return None
cache_entrydic = self.cacheDic()['fetched']
logger.debug('fetching missing id list %r', idlist)
try:
arxivdict = arxiv2bib.arxiv2bib_dict(idlist)
# USE FOR DEBUGGING:
#arxivdict = {}
#logger.critical("DEACTIVATED ARXIV QUERY FOR DEBUGGING")
except HTTPError as error:
if error.getcode() == 403:
ArxivFetchedAPIInfoCacheAccessor.arxiv_403_received = True
raise BibArxivApiFetchError(
textwrap.dedent("""\
Error fetching ArXiv API Info: ** 403 Forbidden **
This usually happens when you make many rapid-fire requests in a
row. If you continue to do this, arXiv.org may interpret your requests
as a denial-of-service attack.
For more information, see https://arxiv.org/help/robots.
"""))
logger.warning("HTTP connection error %d: %s.", error.code, error.reason)
logger.warning("ArXiv API information will not be retrieved, and your bibliography "
"might be incomplete.")
return False
except URLError as error:
logger.warning("Error fetching info from arXiv.org: %s.", error.reason)
logger.warning("ArXiv API information will not be retrieved, and your bibliography "
"might be incomplete.")
return False
logger.longdebug('got entries %r: %r', arxivdict.keys(), arxivdict)
for (k,ref) in arxivdict.items():
logger.longdebug("Got reference object for id %s: %r" %(k, ref.__dict__))
cache_entrydic[k]['reference'] = ref
if ref is None or isinstance(ref, arxiv2bib.ReferenceErrorInfo):
errorstr = '<UNKNOWN ERROR>' if ref is None else str(ref)
self.error_arxivids[k] = errorstr
cache_entrydic[k]['error'] = errorstr
cache_entrydic[k]['bibtex'] = ''
else:
cache_entrydic[k]['error'] = None
bibtex = ref.bibtex()
cache_entrydic[k]['bibtex'] = bibtex
logger.longdebug("arxiv api info: Got all references. cacheDic() is now: %r", self.cacheDic())
logger.longdebug("... and cacheObject().cachedic is now: %r", self.cacheObject().cachedic)
return True
def getArxivApiInfo(self, arxivid):
"""
Returns a dictionary::
{
'reference': <arxiv2bib.Reference>,
'bibtex': <bibtex string>,
'error': <None or an error string>,
}
for the given arXiv id in the cache. If the information is not in the cache,
returns `None`.
Don't forget to first call :py:meth:`fetchArxivApiInfo()` to retrieve the
information in the first place.
Note that the 'reference' part may be a
:py:class:`arxiv2bib.ReferenceErrorInfo` if there was an error
retrieving the reference. In that case, the key 'error' contains an
error string.
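
Example (illustrative; assumes :py:meth:`fetchArxivApiInfo()` was already
called for this ID)::

    info = accessor.getArxivApiInfo('1211.1037')
    if info is not None and not info['error']:
        bibtex_str = info['bibtex']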
"""
return self.cacheDic()['fetched'].get(arxivid, None)
ArxivFetchedAPIInfoCacheAccessor.arxiv_403_received = False
class ArxivInfoCacheAccessor(BibUserCacheAccessor):
"""
Cache accessor for detected arXiv information about bibliography entries.
"""
def __init__(self, **kwargs):
super().__init__(
cache_name='arxiv_info',
**kwargs
)
# save bibtex keys for which we couldn't retrieve information because
# the API accessor returned an error. We do this to avoid re-querying
# during this run of bibolamazi; don't save this to the cache, as we
# should re-attempt next time bibolamazi is run.
self.failed_keys = []
def initialize(self, cache_obj, **kwargs):
cache_dic = self.cacheDic()
cache_dic['entries'].set_validation(
EntryFieldsTokenChecker(self.bibolamaziFile().bibliographyData(),
store_type=True,
fields=arxivinfo_from_bibtex_fields)
)
cache_dic.setdefault('cache_built', False)
def rebuild_cache(self, bibdata, arxiv_api_accessor):
"""
Clear and rebuild the entry cache completely.
"""
entrydic = self.cacheDic()['entries']
entrydic.clear()
self.complete_cache(bibdata, arxiv_api_accessor)
def revalidate(self, bibolamazifile):
"""
Re-validates the cache (with `validate()`), and calls `complete_cache()`
again to fetch all missing or out-of-date entries.
"""
self.cacheDic()['entries'].validate()
self.complete_cache(
bibolamazifile.bibliographyData(),
bibolamazifile.cacheAccessor(ArxivFetchedAPIInfoCacheAccessor)
)
def complete_cache(self, bibdata, arxiv_api_accessor):
"""
Makes sure the cache is complete for all items in `bibdata`.
"""
entrydic = self.cacheDic()['entries']
# A list of pairs (citekey, arxiv-id) of entries that still need to be completed
# with info from the arXiv API.
needs_to_be_completed = []
summary_info_mismatch = []
#
# Do a first scan through all the bibdata entries and detect the arXiv
# information using only what we have (to figure out the arXiv ID!).
# We'll query the arXiv API in a second step below.
#
for k,v in bibdata.entries.items():
# arxiv info is in cache and updated with info fetched from the arXiv API
if (k in entrydic and entrydic[k] is not None and
(entrydic[k].get('updated_with_api_info', False) or k in self.failed_keys)):
continue
# else, there's something to refresh and update
arinfo = detectEntryArXivInfo(v)
entrydic[k] = arinfo
logger.longdebug("detected arXiv information for `%s': %r", k, arinfo)
if arinfo is not None:
needs_to_be_completed.append( (k, arinfo['arxivid'],) )
logger.longdebug("complete_cache(): needs_to_be_completed=%r\nentrydic=%r\n",
needs_to_be_completed,
entrydic)
#
# Complete the entry arXiv info using fetched info from the arXiv API.
#
arxiv_api_accessor.fetchArxivApiInfo( (x[1] for x in needs_to_be_completed), )
fail_aids = []
for (k,aid) in needs_to_be_completed:
api_info = arxiv_api_accessor.getArxivApiInfo(aid)
if (api_info is None or api_info['error'] or 'reference' not in api_info):
errstr = ""
if api_info and api_info.get('error'):
    errstr = ": " + api_info['error']
logger.debug("Failed to fetch arXiv information for %s%s", aid, errstr)
fail_aids.append(aid)
self.failed_keys.append(k)
continue
ref = api_info['reference']
logger.longdebug("%s: %s: api_info is %r, ref is %r", k, aid, api_info, ref)
primaryclass = ref.category
doi = None
try:
    doi = ref._field_text('doi', namespace=arxiv2bib.ARXIV)
except Exception:
    # no DOI available from the arXiv API record; keep doi as None
    logger.debug("Couldn't get DOI field for %s from %r", aid, ref)
if (primaryclass and entrydic[k]['primaryclass'] and
# compare overlap only, so that 'cond-mat' and
# 'cond-mat.stat-mech' don't generate the warning
entrydic[k]['primaryclass'][:len(primaryclass)] !=
primaryclass[:len(entrydic[k]['primaryclass'])]):
#
# ### Ignore mismatches in primaryclass, e.g. Zotero's exporter
# ### exports all archive classes as a comma-separated list
# ### which would be terrible for us to parse and check...
#
# summary_info_mismatch.append(
# (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
# )
# logger.warning("Conflicting primaryclass values for entry %s (%s): "
# "%s (given in bibtex) != %s (retrieved from the arxiv)",
# k, aid, entrydic[k]['primaryclass'], primaryclass)
pass
else:
entrydic[k]['primaryclass'] = primaryclass
if (doi and entrydic[k]['doi'] and entrydic[k]['doi'].lower() != doi.lower()):
    summary_info_mismatch.append(
        (k, aid, 'doi', entrydic[k]['doi'], doi)
    )
    # logger.warning("Conflicting doi values for entry %s (%s): "
    #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
    #                k, aid, entrydic[k]['doi'], doi)
elif doi:
    entrydic[k]['doi'] = doi
entrydic[k]['updated_with_api_info'] = True
if fail_aids:
joined = ", ".join(fail_aids if len(fail_aids) <= 8 else fail_aids[:7]+['...'])
logger.warning("Failed to fetch information from the arXiv for %d entries: %s",
len(fail_aids), joined)
# warning for info mismatch
if summary_info_mismatch:
logger.warning(
"Mismatch: info in bibtex ≠ info from arxiv.org\n" +
"\n".join(
"- ‘{key}’ ({arxivid}) [{field}]:\n"
" {value_from_bibtex} ≠ {value_from_arxivorg}"
.format(key=key,
arxivid=arxivid,
field=field,
value_from_bibtex='“'+value_from_bibtex+'”',
value_from_arxivorg='“'+value_from_arxivorg+'”',)
for key, arxivid, field, value_from_bibtex, value_from_arxivorg
in summary_info_mismatch
)
)
def getArXivInfo(self, entrykey):
"""
Get the arXiv information corresponding to entry citekey `entrykey`. If the entry
is not in the cache, returns `None`. Call `complete_cache()` first!
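
Example (illustrative; `MyEntry2012` is a hypothetical citekey)::

    arinfo = accessor.getArXivInfo('MyEntry2012')
    if arinfo is not None and not arinfo['published']:
        arxivid = arinfo['arxivid']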
"""
logger.longdebug("Getting arxiv info for key %r from cache.", entrykey)
entrydic = self.cacheDic()['entries']
if (entrykey not in entrydic):
logger.longdebug(" --> not found :(")
return None
return entrydic.get(entrykey, None)
#def _reference_doi(self, ref):
# try:
# doi = ref._field_text('doi', namespace=arxiv2bib.ARXIV)
# except:
# return None
# if (doi):
# return doi
# return None
#
#def _reference_category(self, ref):
# try:
# return ref.category
# except AttributeError:
# # happens for ReferenceErrorInfo, for example
# return None
def setup_and_get_arxiv_accessor(bibolamazifile):
arxivinfoaccessor = bibolamazifile.cacheAccessor(ArxivInfoCacheAccessor)
arxivinfoaccessor.complete_cache(
bibolamazifile.bibliographyData(),
bibolamazifile.cacheAccessor(ArxivFetchedAPIInfoCacheAccessor)
)
return arxivinfoaccessor
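# Typical usage from within a filter (an illustrative sketch; assumes the
# filter declared both ArxivInfoCacheAccessor and
# ArxivFetchedAPIInfoCacheAccessor as requested cache accessors):
#
#     accessor = setup_and_get_arxiv_accessor(self.bibolamaziFile())
#     arinfo = accessor.getArXivInfo(entry.key)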
# deprecated:
def get_arxiv_cache_access(bibolamazifile):
butils.warn_deprecated(None, "get_arxiv_cache_access()", "setup_and_get_arxiv_accessor()",
modulename="arxivutil.py",
explanation="We now use the new cache mechanism; your filter should "
"also explicitly request the cache accessors ArxivInfoCacheAccessor "
"and ArxivFetchedAPIInfoCacheAccessor so that the cache is correctly "
"set up.")
return setup_and_get_arxiv_accessor(bibolamazifile)