60 lines
No EOL
2.5 KiB
Python
60 lines
No EOL
2.5 KiB
Python
import os
|
|
import json
|
|
import tempfile
|
|
import requests
|
|
import logging
|
|
import subprocess
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
from docling.datamodel.base_models import InputFormat
|
|
|
|
from .config import ARTICLES_CACHE_DIR
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Docling PDF pipeline configuration; picture-image generation is switched on.
_pdf_pipeline_options = PdfPipelineOptions(generate_picture_images=True)
pdf_options = PdfFormatOption(pipeline_options=_pdf_pipeline_options)

# Single converter instance, built once at import time and reused by
# fetch_and_convert_article for every PDF conversion.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: pdf_options,
    },
)
|
|
|
|
def fetch_and_convert_article(article, *, timeout=30):
    """Return the Markdown conversion of an article's PDF, using a disk cache.

    The cache file is ``<ARTICLES_CACHE_DIR>/<id with ':' replaced by '_'>.txt``.
    On a cache hit its content is returned as-is. Otherwise the PDF is
    downloaded from ``article["pdf_url"]``, converted with the module-level
    Docling ``doc_converter``, written to the cache, and returned.

    Args:
        article: Mapping with at least the keys ``"id"`` (str) and
            ``"pdf_url"`` (a falsy value means no PDF is available).
        timeout: Timeout in seconds passed to ``requests.get`` so that a
            stalled server cannot block the caller indefinitely.

    Returns:
        The Markdown text, or ``""`` when there is no PDF URL, the download
        fails, or the conversion fails. A failure to write the cache is
        logged but does not discard a successful conversion.
    """
    article_id = article["id"]
    safe_id = article_id.replace(":", "_")
    cache_file = os.path.join(ARTICLES_CACHE_DIR, f"{safe_id}.txt")

    logger.debug("Checking for cached conversion of article '%s'.", article_id)
    if os.path.exists(cache_file):
        logger.info("Found cached conversion for article '%s'.", article_id)
        with open(cache_file, "r", encoding="utf-8") as f:
            return f.read()

    pdf_url = article["pdf_url"]
    if not pdf_url:
        logger.error("No PDF URL for article '%s'. Skipping conversion.", article_id)
        return ""

    logger.info("Downloading PDF for article '%s' from %s", article_id, pdf_url)
    try:
        response = requests.get(pdf_url, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported the same way as a bad
        # HTTP status: log and return the empty-string failure value.
        logger.exception("Failed to download PDF for article '%s'.", article_id)
        return ""
    if response.status_code != 200:
        logger.error("Failed to download PDF for article '%s'.", article_id)
        return ""

    # The temp file is created inside the try so the finally clause removes
    # it even when writing the downloaded bytes fails part-way.
    tmp_pdf_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf_path = tmp_pdf.name
            tmp_pdf.write(response.content)
        logger.debug("PDF saved temporarily at %s", tmp_pdf_path)

        logger.info("Converting PDF for article '%s' using Docling.", article_id)
        conv_result = doc_converter.convert(source=tmp_pdf_path)
        converted_text = conv_result.document.export_to_markdown()
    except Exception as e:
        logger.exception("Conversion failed for article '%s': %s", article_id, e)
        return ""
    finally:
        if tmp_pdf_path is not None and os.path.exists(tmp_pdf_path):
            os.unlink(tmp_pdf_path)
            logger.debug("Temporary PDF file %s removed.", tmp_pdf_path)

    # Caching is best-effort and kept separate from the conversion: an I/O
    # error here must not throw away text that was converted successfully.
    partial_path = cache_file + ".tmp"
    try:
        cache_dir = os.path.dirname(cache_file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        # Write to a sibling file and move it into place, so an interrupted
        # write never leaves a truncated file that later counts as a hit.
        with open(partial_path, "w", encoding="utf-8") as f:
            f.write(converted_text)
        os.replace(partial_path, cache_file)
    except OSError:
        logger.exception("Could not cache conversion for article '%s'.", article_id)
    else:
        logger.info("Conversion successful for article '%s'. Cached output.", article_id)

    return converted_text