arxiv_audio_summary/vibe/converter.py
2025-03-02 03:22:35 +00:00

60 lines
No EOL
2.5 KiB
Python

import os
import json
import tempfile
import requests
import logging
import subprocess
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from .config import ARTICLES_CACHE_DIR
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# One Docling converter is built at import time and shared by every call in
# this module. PDF inputs use pipeline options that have
# generate_picture_images switched on.
_pdf_pipeline_options = PdfPipelineOptions(generate_picture_images=True)
pdf_options = PdfFormatOption(pipeline_options=_pdf_pipeline_options)
_format_options = {InputFormat.PDF: pdf_options}
doc_converter = DocumentConverter(format_options=_format_options)
def _write_cache_file(cache_file, text):
    """Write *text* to *cache_file* atomically, creating its directory if needed."""
    cache_dir = os.path.dirname(cache_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    # Write to a sibling file first and rename it into place, so a crash or a
    # full disk never leaves a truncated file that later reads would treat as
    # a valid cached conversion.
    partial_path = f"{cache_file}.{os.getpid()}.tmp"
    try:
        with open(partial_path, "w", encoding="utf-8") as f:
            f.write(text)
        os.replace(partial_path, cache_file)
    finally:
        # After a successful os.replace the partial file no longer exists.
        if os.path.exists(partial_path):
            os.unlink(partial_path)


def fetch_and_convert_article(article, timeout=60):
    """
    Return the Markdown text of an article, converting its PDF if needed.

    Checks for a cached conversion of the article. If absent, downloads the
    PDF, converts it using Docling, caches the Markdown text, and returns it.

    Args:
        article: Mapping providing "id" (string) and "pdf_url" entries.
        timeout: Timeout passed to ``requests.get`` for the PDF download, so
            an unresponsive server cannot block the caller indefinitely.

    Returns:
        The Markdown text, or "" when the article has no PDF URL, the
        download fails, or the conversion fails. A failure to write the
        cache file is logged but does not discard a successful conversion.
    """
    article_id = article["id"]
    safe_id = article_id.replace(":", "_")
    cache_file = os.path.join(ARTICLES_CACHE_DIR, f"{safe_id}.txt")

    logger.debug("Checking for cached conversion of article '%s'.", article_id)
    if os.path.exists(cache_file):
        logger.info("Found cached conversion for article '%s'.", article_id)
        with open(cache_file, "r", encoding="utf-8") as f:
            return f.read()

    pdf_url = article["pdf_url"]
    if not pdf_url:
        logger.error("No PDF URL for article '%s'. Skipping conversion.", article_id)
        return ""

    logger.info("Downloading PDF for article '%s' from %s", article_id, pdf_url)
    try:
        response = requests.get(pdf_url, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts follow the same "return empty" path
        # as a non-200 response instead of propagating to the caller.
        logger.exception("Failed to download PDF for article '%s'.", article_id)
        return ""
    if response.status_code != 200:
        logger.error("Failed to download PDF for article '%s'.", article_id)
        return ""

    tmp_pdf_path = None
    try:
        # delete=False: the converter is given the file by path after the
        # handle is closed, so the file is removed manually in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf_path = tmp_pdf.name
            tmp_pdf.write(response.content)
        logger.debug("PDF saved temporarily at %s", tmp_pdf_path)

        logger.info("Converting PDF for article '%s' using Docling.", article_id)
        conv_result = doc_converter.convert(source=tmp_pdf_path)
        converted_text = conv_result.document.export_to_markdown()
    except Exception as e:
        logger.exception("Conversion failed for article '%s': %s", article_id, e)
        return ""
    finally:
        if tmp_pdf_path is not None and os.path.exists(tmp_pdf_path):
            os.unlink(tmp_pdf_path)
            logger.debug("Temporary PDF file %s removed.", tmp_pdf_path)

    # Caching is best-effort: the conversion already succeeded, so an I/O
    # error here must not throw the result away.
    try:
        _write_cache_file(cache_file, converted_text)
    except OSError:
        logger.exception("Could not cache conversion for article '%s'.", article_id)
    else:
        logger.info("Conversion successful for article '%s'. Cached output.", article_id)
    return converted_text