arxiv_audio_summary/vibe/fetcher.py

import os
import json
import requests
from bs4 import BeautifulSoup
import logging
from .config import ARXIV_CACHE_FILE
logger = logging.getLogger(__name__)
def fetch_arxiv_list(force_refresh=False, arxiv_url=None):
    """
    Fetches the latest CS articles from arXiv. If a cache exists, reads from it
    unless force_refresh is True. Otherwise, parses the arXiv page, extracts
    article metadata, and caches it.
    """
    if arxiv_url is None:
        from .config import DEFAULT_ARXIV_URL
        arxiv_url = DEFAULT_ARXIV_URL
    logger.debug("Checking for cached arXiv list at %s", ARXIV_CACHE_FILE)
    if not force_refresh and os.path.exists(ARXIV_CACHE_FILE):
        logger.info("Cache found for arXiv list. Loading from cache.")
        with open(ARXIV_CACHE_FILE, "r", encoding="utf-8") as f:
            articles = json.load(f)
        logger.debug("Loaded %d articles from cache.", len(articles))
        return articles
logger.info("Fetching arXiv page from %s", arxiv_url)
response = requests.get(arxiv_url)
if response.status_code != 200:
logger.error("Failed to fetch arXiv page. Status code: %d", response.status_code)
raise Exception("Failed to fetch arXiv page.")
logger.debug("Parsing arXiv HTML content.")
soup = BeautifulSoup(response.text, "html.parser")
articles = []
dl = soup.find("dl")
if not dl:
logger.error("No article list found on arXiv page.")
raise Exception("No article list found on arXiv page.")
dts = dl.find_all("dt")
dds = dl.find_all("dd")
logger.debug("Found %d dt tags and %d dd tags.", len(dts), len(dds))
    for dt, dd in zip(dts, dds):
        id_link = dt.find("a", title="Abstract")
        if not id_link:
            logger.debug("Skipping an article with no abstract link.")
            continue
        article_id = id_link.text.strip()
        pdf_link = dt.find("a", title="Download PDF")
        pdf_url = ("https://arxiv.org" + pdf_link["href"]) if pdf_link else None
        title_div = dd.find("div", class_="list-title")
        title = title_div.text.replace("Title:", "").strip() if title_div else "No title"
        abstract_div = dd.find("p", class_="mathjax")
        abstract = abstract_div.text.strip() if abstract_div else "No abstract"
        articles.append({
            "id": article_id,
            "title": title,
            "abstract": abstract,
            "pdf_url": pdf_url,
        })
        logger.debug("Parsed article: %s", article_id)
    with open(ARXIV_CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(articles, f)
    logger.info("Cached %d articles to %s", len(articles), ARXIV_CACHE_FILE)
    return articles
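

if __name__ == "__main__":
    # Minimal usage sketch (an illustrative addition, not part of the
    # original module): because of the relative imports above, run it as
    # `python -m arxiv_audio_summary.vibe.fetcher` from the project root.
    # Loads the cached listing if present, otherwise fetches and caches it,
    # then prints the first few entries.
    logging.basicConfig(level=logging.INFO)
    items = fetch_arxiv_list()
    for article in items[:5]:
        print(article["id"], "-", article["title"])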