# arxiv_audio_summary/vibe/converter.py

import os
import json
import tempfile
import requests
import logging
import subprocess
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

from .config import ARTICLES_CACHE_DIR

logger = logging.getLogger(__name__)

# Shared Docling converter, constructed once when this module is imported.
# PDF input is routed through a pipeline with generate_picture_images enabled.
_pdf_pipeline_options = PdfPipelineOptions(generate_picture_images=True)
pdf_options = PdfFormatOption(pipeline_options=_pdf_pipeline_options)
doc_converter = DocumentConverter(
    format_options={InputFormat.PDF: pdf_options},
)

def _write_cache_atomically(cache_file, text):
    """Write *text* to *cache_file* so readers never see a partial file."""
    cache_dir = os.path.dirname(cache_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    # Stage next to the target so os.replace stays on one filesystem.
    staging_path = f"{cache_file}.{os.getpid()}.tmp"
    try:
        with open(staging_path, "w", encoding="utf-8") as staging:
            staging.write(text)
        os.replace(staging_path, cache_file)
    finally:
        # After a successful replace the staging file is gone; this only
        # fires when the write or the rename failed part-way.
        if os.path.exists(staging_path):
            os.unlink(staging_path)


def fetch_and_convert_article(article, *, timeout=60):
    """
    Return the Markdown conversion of an article's PDF, using a disk cache.

    If a cached conversion exists it is returned directly. Otherwise the PDF
    is downloaded, converted with Docling, cached, and returned.

    Args:
        article: Mapping that provides "id" and "pdf_url" entries.
        timeout: Seconds to wait for the PDF download; passed straight to
            ``requests.get``. Without it a stalled server would block forever.

    Returns:
        The Markdown text, or "" when there is no PDF URL, the download
        fails, or the conversion fails.
    """
    article_id = article["id"]
    # ":" -> "_" is the historical scheme. Path separators are flattened too,
    # because an ID containing "/" would otherwise point into a subdirectory
    # that does not exist and could never be cached.
    safe_id = article_id.replace(":", "_").replace("/", "_").replace(os.sep, "_")
    cache_file = os.path.join(ARTICLES_CACHE_DIR, f"{safe_id}.txt")

    logger.debug("Checking for cached conversion of article '%s'.", article_id)
    try:
        with open(cache_file, "r", encoding="utf-8") as f:
            cached_text = f.read()
    except FileNotFoundError:
        pass  # Cache miss: fall through to download and convert.
    else:
        logger.info("Found cached conversion for article '%s'.", article_id)
        return cached_text

    pdf_url = article["pdf_url"]
    if not pdf_url:
        logger.error("No PDF URL for article '%s'. Skipping conversion.", article_id)
        return ""

    logger.info("Downloading PDF for article '%s' from %s", article_id, pdf_url)
    try:
        response = requests.get(pdf_url, timeout=timeout)
    except requests.RequestException as e:
        # Network errors are reported like every other failure: log, return "".
        logger.error("Failed to download PDF for article '%s': %s", article_id, e)
        return ""
    if response.status_code != 200:
        logger.error("Failed to download PDF for article '%s'.", article_id)
        return ""

    tmp_pdf_path = None
    try:
        # The temp file is created inside the try so that it is removed even
        # when writing the downloaded bytes fails.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf_path = tmp_pdf.name
            tmp_pdf.write(response.content)
        logger.debug("PDF saved temporarily at %s", tmp_pdf_path)

        try:
            logger.info("Converting PDF for article '%s' using Docling.", article_id)
            conv_result = doc_converter.convert(source=tmp_pdf_path)
            converted_text = conv_result.document.export_to_markdown()
        except Exception as e:
            # Boundary around the third-party converter: log and degrade.
            logger.exception("Conversion failed for article '%s': %s", article_id, e)
            return ""
    finally:
        if tmp_pdf_path and os.path.exists(tmp_pdf_path):
            os.unlink(tmp_pdf_path)
            logger.debug("Temporary PDF file %s removed.", tmp_pdf_path)

    # Caching is best-effort: a write failure must not discard a conversion
    # that already succeeded.
    try:
        _write_cache_atomically(cache_file, converted_text)
    except OSError as e:
        logger.warning(
            "Conversion successful for article '%s' but caching to %s failed: %s",
            article_id,
            cache_file,
            e,
        )
    else:
        logger.info("Conversion successful for article '%s'. Cached output.", article_id)
    return converted_text