arxiv_audio_summary/vibe/fetcher.py

import os
import json
import requests
from bs4 import BeautifulSoup
import logging
from .config import ARXIV_CACHE_FILE
logger = logging.getLogger(__name__)
def fetch_arxiv_list(force_refresh=False, arxiv_url=None):
    """
    Fetches the latest CS articles from arXiv. If a cache exists, reads from it
    unless force_refresh is True. Otherwise, parses the arXiv page, extracts
    article metadata, and caches it.
    """
    if arxiv_url is None:
        from .config import DEFAULT_ARXIV_URL
        arxiv_url = DEFAULT_ARXIV_URL
    logger.debug("Checking for cached arXiv list at %s", ARXIV_CACHE_FILE)
    if not force_refresh and os.path.exists(ARXIV_CACHE_FILE):
        logger.info("Cache found for arXiv list. Loading from cache.")
        with open(ARXIV_CACHE_FILE, "r", encoding="utf-8") as f:
            articles = json.load(f)
        logger.debug("Loaded %d articles from cache.", len(articles))
        return articles
logger.info("Fetching arXiv page from %s", arxiv_url)
response = requests.get(arxiv_url)
if response.status_code != 200:
logger.error("Failed to fetch arXiv page. Status code: %d", response.status_code)
raise Exception("Failed to fetch arXiv page.")
logger.debug("Parsing arXiv HTML content.")
soup = BeautifulSoup(response.text, "html.parser")
articles = []
dl = soup.find("dl")
if not dl:
logger.error("No article list found on arXiv page.")
raise Exception("No article list found on arXiv page.")
dts = dl.find_all("dt")
dds = dl.find_all("dd")
logger.debug("Found %d dt tags and %d dd tags.", len(dts), len(dds))
    for dt, dd in zip(dts, dds):
        id_link = dt.find("a", title="Abstract")
        if not id_link:
            logger.debug("Skipping an article with no abstract link.")
            continue
        article_id = id_link.text.strip()
        pdf_link = dt.find("a", title="Download PDF")
        pdf_url = ("https://arxiv.org" + pdf_link["href"]) if pdf_link else None
        title_div = dd.find("div", class_="list-title")
        title = title_div.text.replace("Title:", "").strip() if title_div else "No title"
        abstract_div = dd.find("p", class_="mathjax")
        abstract = abstract_div.text.strip() if abstract_div else "No abstract"
        articles.append({
            "id": article_id,
            "title": title,
            "abstract": abstract,
            "pdf_url": pdf_url,
        })
        logger.debug("Parsed article: %s", article_id)
    with open(ARXIV_CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(articles, f)
    logger.info("Cached %d articles to %s", len(articles), ARXIV_CACHE_FILE)
    return articles
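

if __name__ == "__main__":
    # Minimal usage sketch (an illustrative addition, not part of the
    # original module): because of the relative imports above, run it as
    # `python -m arxiv_audio_summary.vibe.fetcher` from the project root.
    # Loads the cached listing if present, otherwise fetches and caches it,
    # then prints the first few entries.
    logging.basicConfig(level=logging.INFO)
    items = fetch_arxiv_list()
    for article in items[:5]:
        print(article["id"], "-", article["title"])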