import json
import logging
import os

import requests
from bs4 import BeautifulSoup

from .config import ARXIV_CACHE_FILE

logger = logging.getLogger(__name__)


def fetch_arxiv_list(force_refresh=False, arxiv_url=None):
    """
    Fetches the latest CS articles from arXiv. If a cache exists, reads from it
    unless force_refresh is True. Otherwise, parses the arXiv page, extracts
    article metadata, and caches it.
    """
    if arxiv_url is None:
        # Resolve the default lazily from config when the caller supplies no URL.
        from .config import DEFAULT_ARXIV_URL
        arxiv_url = DEFAULT_ARXIV_URL

logger.debug("Checking for cached arXIV list at %s", ARXIV_CACHE_FILE)
|
||
|
|
if not force_refresh and os.path.exists(ARXIV_CACHE_FILE):
|
||
|
|
logger.info("Cache found for arXiv list. Loading from cache.")
|
||
|
|
with open(ARXIV_CACHE_FILE, "r", encoding="utf-8") as f:
|
||
|
|
articles = json.load(f)
|
||
|
|
logger.debug("Loaded %d articles from cache.", len(articles))
|
||
|
|
return articles
|
||
|
|
|
||
|
|
logger.info("Fetching arXiv page from %s", arxiv_url)
|
||
|
|
response = requests.get(arxiv_url)
|
||
|
|
if response.status_code != 200:
|
||
|
|
logger.error("Failed to fetch arXiv page. Status code: %d", response.status_code)
|
||
|
|
raise Exception("Failed to fetch arXiv page.")
|
||
|
|
|
||
|
|
logger.debug("Parsing arXiv HTML content.")
|
||
|
|
soup = BeautifulSoup(response.text, "html.parser")
|
||
|
|
articles = []
|
||
|
|
dl = soup.find("dl")
|
||
|
|
if not dl:
|
||
|
|
logger.error("No article list found on arXiv page.")
|
||
|
|
raise Exception("No article list found on arXiv page.")
|
||
|
|
|
||
|
|
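    # The listing page renders each entry as a <dt>/<dd> pair inside the <dl>:
    # the <dt> carries the abstract and PDF links, the <dd> the title and
    # abstract text. A simplified, illustrative entry (not copied verbatim
    # from a live page):
    #   <dt><a title="Abstract" href="/abs/2401.00001">arXiv:2401.00001</a>
    #       <a title="Download PDF" href="/pdf/2401.00001">pdf</a></dt>
    #   <dd><div class="list-title">Title: Some Paper</div>
    #       <p class="mathjax">Abstract text…</p></dd>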
    dts = dl.find_all("dt")
    dds = dl.find_all("dd")
    logger.debug("Found %d dt tags and %d dd tags.", len(dts), len(dds))
    for dt, dd in zip(dts, dds):
        id_link = dt.find("a", title="Abstract")
        if not id_link:
            logger.debug("Skipping an article with no abstract link.")
            continue
        article_id = id_link.text.strip()
        pdf_link = dt.find("a", title="Download PDF")
        # hrefs on the listing page are site-relative, so prefix the host.
        pdf_url = ("https://arxiv.org" + pdf_link["href"]) if pdf_link else None

        title_div = dd.find("div", class_="list-title")
        title = title_div.text.replace("Title:", "").strip() if title_div else "No title"

        abstract_div = dd.find("p", class_="mathjax")
        abstract = abstract_div.text.strip() if abstract_div else "No abstract"

        articles.append({
            "id": article_id,
            "title": title,
            "abstract": abstract,
            "pdf_url": pdf_url,
        })
        logger.debug("Parsed article: %s", article_id)

    with open(ARXIV_CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(articles, f)
    logger.info("Cached %d articles to %s", len(articles), ARXIV_CACHE_FILE)
    return articles
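

# Minimal usage sketch, assuming this module lives inside a package (the
# relative imports above require package context; `myapp.arxiv` is a
# hypothetical location, not defined by this repo):
#
#     import logging
#     from myapp.arxiv import fetch_arxiv_list
#
#     logging.basicConfig(level=logging.DEBUG)
#     for article in fetch_arxiv_list(force_refresh=True)[:5]:
#         print(article["id"], article["title"])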