# Source code for arxivcheck.arxiv

"""
arxiv.py
====================================
Search the arXiv API and build BibTeX entries for arXiv items
"""

from __future__ import unicode_literals, print_function, absolute_import
from builtins import input
import feedparser
from doi2bib.crossref import get_bib_from_doi
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
try:
    from urllib import quote
except ImportError:
    from urllib.parse import quote
import re
from unidecode import unidecode
# Endpoint of the arXiv export API; the query string is appended to it.
bare_url = "http://export.arxiv.org/api/query"


# Lower-case three-letter month abbreviations, January first.
months = "jan feb mar apr may jun jul aug sep oct nov dec".split()


def ask_which_is(title, items):
    """Ask interactively which arxiv item corresponds to a title.

    Each candidate is shown in turn next to the searched title. Answering
    ``y`` accepts the candidate, ``q`` stops asking, and any other answer
    moves on to the next candidate.

    Parameters
    ----------
    title : str
        title that was searched for
    items : list of dicts
        candidate arxiv items; the ``"title"`` key of each one is shown

    Returns
    -------
    found : bool
        True if one of the candidates was accepted
    result : dict
        The accepted item, or an empty dict if none was accepted
    """
    prompt = "\n\tArxiv:{} \n\tIt is \n\t{}\n\t Correct?y(yes)|n(no)|q(quit)"
    for candidate in items:
        answer = input(prompt.format(
            unidecode(candidate["title"]), unidecode(title)))
        if answer == "y":
            return True, candidate
        if answer == "q":
            break

    return False, {}


def get_arxiv_info(value, field="id"):
    """Query the arxiv API and return the entries that match.

    Parameters
    ----------
    value : str
        value of the field
    field : str
        the field used to build the query string

    Returns
    -------
    found : bool
        True if at least one arxiv has been found
    items : list of dicts
        List containing all the arxiv entries related
        to the search query
    """
    # The value is transliterated to ASCII and percent-encoded before it is
    # appended to the query; the field prefix is used verbatim.
    query = "?search_query=" + field + ":" + quote(unidecode(value))
    entries = feedparser.parse(bare_url + query).entries

    return len(entries) > 0, entries


def add_eprint_to_bib(bib, eprint):
    """
    Insert the eprint information in a given bibtex string

    The ``eprint`` and ``archiveprefix`` fields are inserted as two new
    lines right before the first newline that follows the first comma
    after the first opening brace.

    Parameters
    ----------
        bib: str
            The bibtex string without the arxiv number
        eprint: str
            The arxiv number

    Returns
    -------
        bib: str
            The bibtex string with the arxiv number

    Raises
    ------
        RuntimeError
            If ``bib`` has no opening brace, no comma after it, or no
            newline after that comma
    """
    # Locate, in order, the opening brace, the next comma and the next
    # newline; each search starts where the previous one matched.
    position = 0
    for landmark in ('{', ',', '\n'):
        position = bib.find(landmark, position)
        if position == -1:
            raise RuntimeError("CrossRef returned badly formed BibTeX file.")

    extra_fields = (
        '\n'
        '        eprint={' + eprint + '},\n'
        '        archiveprefix={arXiv},'
    )

    return bib[:position] + extra_fields + bib[position:]


def generate_bib_from_arxiv(arxiv_item, value, field="id"):
    """
    Build a bibtex ``article`` entry from an arxiv item

    Parameters
    ----------
        arxiv_item: dict
            Arxiv entry; its ``id``, ``link``, ``title``, ``authors``
            and ``published`` values are read
        value: str
            The searched value; used as the arxiv number unless
            ``field`` is ``"ti"``
        field: str
            Field used for the search. With ``"ti"`` the arxiv number
            is taken from the item's ``id`` instead of ``value``

    Returns
    -------
        bib: str
            The bibtex string related with the arxiv item
    """
    if field == "ti":
        # A title search carries no arxiv number in ``value``, so take
        # whatever follows "/abs/" in the entry id (any URL scheme).
        journal = "arxiv:" + arxiv_item["id"].rsplit("/abs/", 1)[-1]
    else:
        journal = "arxiv:" + value

    names = [author["name"] for author in arxiv_item.authors]
    authors = " and ".join(names)
    # First whitespace-separated token of the first author's name; empty
    # when the item lists no authors, so the entry ID can still be built.
    first_token = names[0].split(" ")[0] if names else ""

    published = arxiv_item.published.split("-")
    year = published[0] if len(published) > 1 else ''

    database = BibDatabase()
    database.entries = [
        {
            "journal": journal,
            "url": arxiv_item.link,
            "ID": year + first_token + journal,
            "title": arxiv_item.title,
            "year": year,
            "author": authors,
            "ENTRYTYPE": "article"
        }
    ]

    return BibTexWriter().write(database)


def get_arxiv_pdf_link(value, field="id"):
    """Find the PDF link of the first arxiv item matching a search.

    Parameters
    ----------
    value : str
        value of the field; any ``arxiv:`` text (case-insensitive) is
        removed before searching
    field : str
        field used for the arxiv search API

    Returns
    -------
    found : bool
        True if an arxiv item was found and it has a link of type
        ``application/pdf``
    link : str or None
        The ``href`` of the first such link, or None
    """
    value = re.sub(r"arxiv:", "", value, flags=re.I)
    found, items = get_arxiv_info(value, field)
    if not found:
        return found, None

    # Only the first search result is considered.
    pdf_links = [
        entry_link for entry_link in items[0].links
        if entry_link.get("type") == "application/pdf"
    ]
    if not pdf_links:
        return False, None

    return True, pdf_links[0]["href"]


def check_arxiv_published(
        value, field="id", get_first=True, keep_eprint=False):
    """
    Search an arxiv item and return its bibtex, using the DOI when the
    item has one

    Parameters
    ----------
        value: str
            value of the field; any ``arxiv:`` text (case-insensitive)
            is removed before searching
        field: str
            field used for the arxiv search API
        get_first: bool
            If False, and a title search (``field="ti"``) still has
            several candidates, ask interactively which one is correct
            instead of taking the first
        keep_eprint: bool
            If True keep the arxiv number if the paper
            has already been published

    Returns
    -------
        found: bool
            True if found the arxiv item
        published: bool
            True if the arxiv has already been
            published
        bib: str
            bibtex string
    """
    published = False
    bib = ""
    item = None
    value = re.sub(r"arxiv:", "", value, flags=re.I)
    found, items = get_arxiv_info(value, field)
    if found:
        if field == "ti":
            # Prefer an exact title match, ignoring case, spaces and
            # (on the arxiv side) newlines.
            title = value.lower().replace(" ", "")
            for item_arxiv in items:
                title_arxiv = (
                    item_arxiv["title"].lower()
                    .replace(" ", "").replace("\n", ""))
                if title_arxiv == title:
                    items = [item_arxiv]
                    break

        if get_first is False and field == "ti" and len(items) > 1:
            found, item = ask_which_is(value, items)
        else:
            item = items[0]

    if not found:
        print("\t\nArxiv not found.")
        return found, published, bib

    if "arxiv_doi" in item:
        published, bib = get_bib_from_doi(item["arxiv_doi"])
        # Only touch the bibtex when the DOI lookup reported success;
        # otherwise ``bib`` is returned exactly as the lookup gave it.
        if keep_eprint and published:
            # Arxiv number = what follows "/abs/" in the entry id, minus
            # a trailing version suffix such as "v2". This keeps the
            # archive prefix of old-style ids (e.g. "hep-th/9901001").
            eprint = re.sub(
                r"v\d+$", "", item["id"].rsplit("/abs/", 1)[-1])
            bib = add_eprint_to_bib(bib, eprint)
    else:
        bib = generate_bib_from_arxiv(item, value, field)

    return found, published, bib