# Source code for abstracts_explorer.plugin

"""
Plugin Framework
================

This module provides the plugin framework for extending neurips-abstracts
with custom data downloaders.

The framework consists of:
- Base classes for plugin implementation (DownloaderPlugin, LightweightDownloaderPlugin)
- Schema conversion utilities (convert_to_lightweight_schema)
- Plugin registry for managing plugins (PluginRegistry)
- Pydantic models for data validation (LightweightPaper)
"""

import logging
from abc import ABC, abstractmethod
from datetime import datetime
from typing import Any, Dict, List, Optional

import requests
from pydantic import BaseModel, ValidationError, field_validator

logger = logging.getLogger(__name__)



[docs]
class DownloaderPlugin(ABC):
    """
    Abstract foundation shared by every downloader plugin.

    Concrete plugins implement ``download`` and ``get_metadata`` and
    describe themselves through the class-level metadata below.

    Giving ``_start_year`` a positive value and overriding
    ``get_url(year)`` switches on the automatic ``supported_years``
    computation, including the availability probe for the current year.
    """

    # Plugin metadata -- subclasses are expected to override these.
    plugin_name: str = "base"
    plugin_description: str = "Base downloader plugin"
    # First supported year; a non-positive value disables the automatic
    # year-range computation (``supported_years`` is then empty).
    _start_year: int = 0

    @property
    def supported_years(self) -> List[int]:
        """
        Conference years this plugin can serve, computed lazily.

        On first access the list ``_start_year .. current_year - 1`` is
        built; the current year is appended only when
        ``_check_current_year_available(current_year)`` reports success.
        The result is memoised on the instance, so later accesses return
        the very same list object without probing again.

        Returns
        -------
        list of int
            Supported conference years (empty when ``_start_year`` is not
            positive).
        """
        try:
            return self._supported_years_cache
        except AttributeError:
            # Nothing memoised on this instance yet: compute below.
            pass

        years: List[int] = []
        if self._start_year > 0:
            this_year = datetime.now().year
            years.extend(range(self._start_year, this_year))
            if self._check_current_year_available(this_year):
                years.append(this_year)

        self._supported_years_cache = years
        return years

    @supported_years.setter
    def supported_years(self, value: List[int]) -> None:
        # Assigning replaces the memoised list and bypasses the computation.
        self._supported_years_cache = value


[docs]
    def get_url(self, year: int) -> str:
        """
        Return the data URL that belongs to ``year``.

        The base implementation only raises; subclasses override it so
        that :attr:`supported_years` can probe whether the current year's
        data is already published.

        Parameters
        ----------
        year : int
            Conference year

        Returns
        -------
        str
            URL used for downloading or probing availability.

        Raises
        ------
        NotImplementedError
            Always, unless a subclass overrides this method.
        """
        message = "Subclasses must implement get_url()"
        raise NotImplementedError(message)


    def _check_current_year_available(self, current_year: int) -> bool:
        """
        Probe whether the data URL for *current_year* can be reached.

        A HEAD request (redirects followed, 3-second timeout) is sent to
        ``get_url(current_year)``.  A missing ``get_url`` implementation
        or any ``requests`` failure is logged at debug level and treated
        as "not available".

        Parameters
        ----------
        current_year : int
            Year to probe.

        Returns
        -------
        bool
            ``True`` only when the final response has HTTP status 200.
        """
        try:
            probe_url = self.get_url(current_year)
            reply = requests.head(probe_url, timeout=3, allow_redirects=True)
        except (requests.RequestException, NotImplementedError):
            logger.debug(
                "%s: current-year availability check failed for year %d",
                self.plugin_name,
                current_year,
            )
            return False
        return reply.status_code == 200


[docs]
    @abstractmethod
    def download(
        self,
        year: Optional[int] = None,
        output_path: Optional[str] = None,
        force_download: bool = False,
        **kwargs: Any,
    ) -> List["LightweightPaper"]:
        """
        Fetch papers from the plugin's data source.

        Abstract: every concrete plugin supplies its own implementation.

        Parameters
        ----------
        year : int, optional
            Year to download papers for (if applicable)
        output_path : str, optional
            Path to save the downloaded data
        force_download : bool
            Force re-download even if data exists
        **kwargs : Any
            Additional plugin-specific parameters

        Returns
        -------
        list of LightweightPaper
            Validated paper objects ready for database insertion
        """
        return None



[docs]
    @abstractmethod
    def get_metadata(self) -> Dict[str, Any]:
        """
        Describe this plugin as a dictionary.

        Abstract: every concrete plugin supplies its own implementation.
        Other helpers in this module read the keys ``name``,
        ``conference_name`` and ``supported_years`` from the result.

        Returns
        -------
        dict
            Plugin metadata including name, description, supported years, etc.
        """
        return None



[docs]
    def validate_year(self, year: Optional[int]) -> None:
        """
        Check a requested year against :attr:`supported_years`.

        ``None`` is always accepted (and does not touch
        ``supported_years``); an empty ``supported_years`` list accepts
        every year.

        Parameters
        ----------
        year : int or None
            Year to validate

        Raises
        ------
        ValueError
            If year is not supported by this plugin
        """
        if year is None:
            return

        known_years = self.supported_years
        if not known_years or year in known_years:
            return

        raise ValueError(
            f"Year {year} not supported by {self.plugin_name}. " f"Supported years: {self.supported_years}"
        )





[docs]
class LightweightDownloaderPlugin(DownloaderPlugin):
    """
    Lightweight base class for downloader plugins using simplified schema.

    This plugin type uses a simpler data format that only requires essential fields,
    making it easier to implement new plugins. The data is automatically converted
    to the full NeurIPS schema when loaded into the database.

    The field names below follow the ``LightweightPaper`` model defined in
    this module, which is what ``download`` is annotated to return.

    Required fields per paper:
        - title (str): Paper title
        - authors (list of str): Author names; ``LightweightPaper`` rejects
          names containing semicolons.  Author dicts with 'fullname' can be
          flattened with ``convert_to_lightweight_schema``.
        - abstract (str): Paper abstract
        - session (str): Session/workshop/track name
        - poster_position (str): Poster position identifier
        - year (int): Conference year (e.g., 2025)
        - conference (str): Conference name (e.g., "NeurIPS", "ICLR")

    Optional fields per paper:
        - paper_pdf_url (str): URL to paper PDF
        - poster_image_url (str): URL to poster image
        - url (str): General URL (e.g., OpenReview, ArXiv)
        - room_name (str): Room name for presentation
        - keywords (list): List of keywords/tags
        - starttime (str): Start time (ISO format or readable string)
        - endtime (str): End time (ISO format or readable string)
        - original_id (int): Paper ID from the original source
        - award (str): Award name (e.g., "Best Paper Award")
    """

    # Plugin metadata (should be overridden in subclasses)
    plugin_name: str = "lightweight_base"
    plugin_description: str = "Lightweight base downloader plugin"



"""
Schema Converter
================

Utilities for converting between lightweight and full NeurIPS schema formats.
"""



[docs]
def convert_to_lightweight_schema(papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Reduce papers in the full NeurIPS schema to the lightweight format.

    Only the fields used by the lightweight schema are carried over;
    everything else in the source records is dropped.

    Parameters
    ----------
    papers : list
        Papers in full NeurIPS format.  The following keys are read:

        - id (copied to ``original_id`` whenever the key is present)
        - title, or the legacy ``name`` as a fallback
        - authors (list of dicts with 'fullname'/'name', list of str, or a
          semicolon-separated string)
        - abstract, session, poster_position, year, conference
        - paper_pdf_url, or ``paper_url`` as a fallback (optional)
        - poster_image_url, url, room_name (optional)
        - keywords (list, or comma-separated string; optional)
        - starttime, endtime (optional)
        - award, or ``decision`` when it mentions "award" (optional)

    Returns
    -------
    list of dict
        Papers in lightweight format.  Authors are lists of strings.
        Records without a title are omitted.

    Examples
    --------
    >>> papers = [
    ...     {
    ...         'id': 123,
    ...         'title': 'Deep Learning',
    ...         'authors': [
    ...             {'id': 1, 'fullname': 'John Doe', 'institution': 'MIT'},
    ...             {'id': 2, 'fullname': 'Jane Smith', 'institution': 'Stanford'}
    ...         ],
    ...         'abstract': 'A paper about deep learning',
    ...         'session': 'Session A',
    ...         'poster_position': 'A-42',
    ...         'year': 2025,
    ...         'conference': 'NeurIPS'
    ...     }
    ... ]
    >>> convert_to_lightweight_schema(papers)[0]['authors']
    ['John Doe', 'Jane Smith']

    Notes
    -----
    - Missing or ``None`` values for the required text fields become ``""``
      (``"No session"`` for ``session``) and a missing year becomes ``0``.
    - Author names are passed through ``sanitize_author_names``.
    """
    converted: List[Dict[str, Any]] = []

    for source in papers:
        # 'title' wins; the legacy 'name' key is only a fallback.
        heading = source.get("title") or source.get("name", "")
        if not heading:
            continue

        # Normalise the three accepted author representations to List[str].
        raw_authors = source.get("authors", [])
        names: List[str] = []
        if isinstance(raw_authors, str):
            names = [piece.strip() for piece in raw_authors.split(";") if piece.strip()]
        elif isinstance(raw_authors, list):
            for entry in raw_authors:
                if isinstance(entry, dict):
                    candidate = entry.get("fullname") or entry.get("name", "")
                    if candidate:
                        names.append(candidate)
                elif isinstance(entry, str):
                    names.append(entry)

        # LightweightPaper refuses semicolons inside author names.
        names = sanitize_author_names(names)

        # ``or`` (rather than a .get default) also replaces explicit None values.
        record: Dict[str, Any] = {
            "title": heading,
            "authors": names,
            "abstract": source.get("abstract") or "",
            "session": source.get("session") or "No session",
            "poster_position": source.get("poster_position") or "",
            "year": source.get("year") or 0,
            "conference": source.get("conference") or "",
        }

        if "id" in source:
            record["original_id"] = source["id"]

        pdf_link = source.get("paper_pdf_url") or source.get("paper_url")
        if pdf_link:
            record["paper_pdf_url"] = pdf_link

        for key in ("poster_image_url", "url", "room_name"):
            value = source.get(key)
            if value:
                record[key] = value

        # Keywords arrive either as a list or as a comma-separated string.
        tags = source.get("keywords")
        if tags:
            if isinstance(tags, str):
                record["keywords"] = [tag.strip() for tag in tags.split(",") if tag.strip()]
            elif isinstance(tags, list):
                record["keywords"] = tags

        for key in ("starttime", "endtime"):
            value = source.get(key)
            if value:
                record[key] = value

        # An explicit award wins; a decision only counts when it names an award.
        honour = source.get("award")
        if not honour:
            verdict = source.get("decision") or ""
            honour = verdict if "award" in verdict.lower() else None
        if honour:
            record["award"] = honour

        converted.append(record)

    return converted



"""
Plugin Registry
===============

Registry for managing and accessing downloader plugins.
"""

# DownloaderPlugin is defined earlier in this file

logger = logging.getLogger(__name__)



[docs]
class PluginRegistry:
    """
    Registry for managing downloader plugins.

    Plugins are kept in a dict keyed by their ``plugin_name``, so each
    name maps to at most one plugin instance.
    """


[docs]
    def __init__(self) -> None:
        """Create an empty registry."""
        # Maps plugin_name -> registered plugin instance.
        self._plugins: Dict[str, DownloaderPlugin] = {}



[docs]
    def register(self, plugin: DownloaderPlugin) -> None:
        """
        Register a new plugin under its ``plugin_name``.

        A plugin already stored under the same name is replaced; a warning
        is logged when the replaced object is a different instance, so
        that accidental name clashes do not go unnoticed.

        Parameters
        ----------
        plugin : DownloaderPlugin
            Plugin instance to register

        Raises
        ------
        TypeError
            If *plugin* is not a ``DownloaderPlugin`` instance.
        """
        if not isinstance(plugin, DownloaderPlugin):
            raise TypeError(f"Plugin must be an instance of DownloaderPlugin, got {type(plugin)}")

        name = plugin.plugin_name
        previous = self._plugins.get(name)
        if previous is not None and previous is not plugin:
            logger.warning("Replacing already registered plugin: %s", name)

        self._plugins[name] = plugin
        # Lazy %-formatting, matching the other log calls in this module.
        logger.info("Registered plugin: %s", name)



[docs]
    def unregister(self, plugin_name: str) -> None:
        """
        Remove a plugin from the registry.

        Unknown names are not an error; they only produce a warning in the
        log.

        Parameters
        ----------
        plugin_name : str
            Name of plugin to unregister
        """
        try:
            del self._plugins[plugin_name]
        except KeyError:
            # Lazy %-formatting, matching the other log calls in this module.
            logger.warning("Plugin not found: %s", plugin_name)
        else:
            logger.info("Unregistered plugin: %s", plugin_name)



[docs]
    def get(self, plugin_name: str) -> Optional[DownloaderPlugin]:
        """
        Look up a registered plugin.

        Parameters
        ----------
        plugin_name : str
            Name of plugin to retrieve

        Returns
        -------
        DownloaderPlugin or None
            The plugin stored under *plugin_name*, or ``None`` when no such
            plugin is registered.
        """
        try:
            return self._plugins[plugin_name]
        except KeyError:
            return None



[docs]
    def list_plugins(self) -> List[Dict[str, Any]]:
        """
        Collect the metadata of every registered plugin.

        Returns
        -------
        list
            One ``get_metadata()`` result per plugin, in registration order
        """
        metadata: List[Dict[str, Any]] = []
        for plugin in self._plugins.values():
            metadata.append(plugin.get_metadata())
        return metadata



[docs]
    def list_plugin_names(self) -> List[str]:
        """
        Return the names under which plugins are registered.

        Returns
        -------
        list
            Plugin names as a new list, in registration order
        """
        return [name for name in self._plugins]




# Global plugin registry
_registry = PluginRegistry()



[docs]
def register_plugin(plugin: DownloaderPlugin) -> None:
    """
    Add a plugin to the module-wide registry.

    Thin convenience wrapper around ``PluginRegistry.register`` on the
    global ``_registry`` instance.

    Parameters
    ----------
    plugin : DownloaderPlugin
        Plugin instance to register
    """
    registry = _registry
    registry.register(plugin)




[docs]
def get_plugin(plugin_name: str) -> Optional[DownloaderPlugin]:
    """
    Look up a plugin in the module-wide registry.

    Parameters
    ----------
    plugin_name : str
        Name of plugin to retrieve

    Returns
    -------
    DownloaderPlugin or None
        The registered plugin, or ``None`` when the name is unknown
    """
    plugin = _registry.get(plugin_name)
    return plugin




[docs]
def list_plugins() -> List[Dict[str, Any]]:
    """
    Return the metadata of all plugins in the module-wide registry.

    Returns
    -------
    list
        List of plugin metadata dictionaries
    """
    metadata = _registry.list_plugins()
    return metadata




[docs]
def list_plugin_names() -> List[str]:
    """
    Return the names of all plugins in the module-wide registry.

    Returns
    -------
    list
        List of plugin names
    """
    names = _registry.list_plugin_names()
    return names



def get_all_plugins() -> List[DownloaderPlugin]:
    """
    Return every plugin instance held by the module-wide registry.

    Each registered name is resolved through the registry; names that
    resolve to ``None`` are left out.

    Returns
    -------
    list of DownloaderPlugin
        List of all registered plugin instances
    """
    lookups = (_registry.get(name) for name in _registry.list_plugin_names())
    return [plugin for plugin in lookups if plugin is not None]


def get_available_filters() -> Dict[str, Any]:
    """
    Get available conferences and years from registered plugins.

    Builds a mapping of conferences to their supported years from the
    metadata of the registered downloader plugins.  Plugins whose
    metadata lacks a ``conference_name`` or has no ``supported_years``
    are ignored.  When several plugins report the same conference, their
    years are merged and each year is listed once.

    Returns
    -------
    dict
        Dictionary with:
        - conferences: alphabetically sorted list of conference names
        - years: all unique years across all plugins, newest first
        - conference_years: dict mapping conference names to their unique
          supported years, newest first

    Examples
    --------
    >>> filters = get_available_filters()
    >>> print(filters['conferences'])  # ['NeurIPS', 'ICLR', ...]
    >>> print(filters['years'])  # [2025, 2024, 2023, ...]
    >>> print(filters['conference_years'])  # {'NeurIPS': [2025, 2024], ...}
    """
    # Collect years per conference in sets so that plugins sharing a
    # conference name cannot introduce duplicate years.
    years_by_conference: Dict[str, set] = {}

    for plugin_info in list_plugins():
        conference_name = plugin_info.get("conference_name")
        supported_years = plugin_info.get("supported_years", [])

        if conference_name and supported_years:
            years_by_conference.setdefault(conference_name, set()).update(supported_years)

    conference_years: Dict[str, List[int]] = {
        name: sorted(years, reverse=True) for name, years in years_by_conference.items()
    }
    all_years_sorted = sorted(set().union(*years_by_conference.values()), reverse=True)
    conferences = sorted(conference_years)

    return {"conferences": conferences, "years": all_years_sorted, "conference_years": conference_years}


def resolve_conference_from_url(url_path: str) -> Optional[str]:
    """
    Map a URL path segment to the canonical conference name.

    The lookup ignores case and considers, in this order of precedence
    (later entries override earlier ones on a clash):

    1. Conference names reported by ``get_available_filters()``
       (e.g. ``NeurIPS``, ``ICLR``)
    2. Plugin names (e.g. ``neurips``, ``iclr``) of plugins whose metadata
       carries both ``name`` and ``conference_name``

    Parameters
    ----------
    url_path : str
        URL path segment to resolve (e.g. ``"neurips"``, ``"ICLR"``).

    Returns
    -------
    str or None
        The canonical conference name if a match is found, ``None`` otherwise.

    Examples
    --------
    >>> resolve_conference_from_url("neurips")  # matches plugin name
    'NeurIPS'
    >>> resolve_conference_from_url("ICLR")  # matches conference name
    'ICLR'
    >>> resolve_conference_from_url("unknown") is None
    True
    """
    known_conferences = get_available_filters().get("conferences", [])
    table: Dict[str, str] = {}
    table.update((name.lower(), name) for name in known_conferences)

    for info in list_plugins():
        short_name = info.get("name", "")
        canonical = info.get("conference_name", "")
        if short_name and canonical:
            table[short_name.lower()] = canonical

    wanted = url_path.lower()
    return table.get(wanted)


"""
Plugin Data Models
==================

Pydantic models for validating plugin data in lightweight schema format.
"""


# ============================================================================
# Lightweight Schema Models (for LightweightDownloaderPlugin)
# ============================================================================



[docs]
class LightweightPaper(BaseModel):
    """
    Lightweight paper model for plugin data validation.

    This model validates the simplified schema used by LightweightDownloaderPlugin.
    It requires only essential fields and optionally supports additional metadata.

    Required Fields
    ---------------
    title : str
        Paper title (must not be blank; surrounding whitespace is stripped)
    authors : list
        List of author names (strings).  Blank entries are dropped, at
        least one name must remain, and names must not contain semicolons.
    abstract : str
        Paper abstract (must not be blank; surrounding whitespace is stripped)
    session : str
        Session/workshop/track name (must not be blank; surrounding
        whitespace is stripped)
    poster_position : str
        Poster position identifier (no validator is attached in this
        module, so an empty string is accepted)
    year : int
        Conference year (e.g., 2025); must lie between 1900 and 2100
    conference : str
        Conference name (e.g., "NeurIPS", "ML4PS"); must not be blank,
        surrounding whitespace is stripped

    Optional Fields
    ---------------
    original_id : int
        Paper ID from the original source
    paper_pdf_url : str
        URL to paper PDF
    poster_image_url : str
        URL to poster image
    url : str
        General URL (OpenReview, ArXiv, etc.)
    room_name : str
        Room name for presentation
    keywords : list
        List of keywords/tags
    starttime : str
        Start time
    endtime : str
        End time
    award : str
        Award name (e.g., "Best Paper Award")
    """

    # Required fields (no defaults, so each must be supplied)
    title: str
    authors: List[str]
    abstract: str
    session: str
    poster_position: str
    year: int
    conference: str

    # Optional fields (all default to None)
    original_id: Optional[int] = None
    paper_pdf_url: Optional[str] = None
    poster_image_url: Optional[str] = None
    url: Optional[str] = None
    room_name: Optional[str] = None
    keywords: Optional[List[str]] = None
    starttime: Optional[str] = None
    endtime: Optional[str] = None
    award: Optional[str] = None


[docs]
    @field_validator("title")
    @classmethod
    def validate_title(cls, v: str) -> str:
        """Reject blank titles; return the title without surrounding whitespace."""
        trimmed = v.strip() if v else ""
        if not trimmed:
            raise ValueError("Paper title cannot be empty")
        return trimmed



[docs]
    @field_validator("authors")
    @classmethod
    def validate_authors(cls, v: List[str]) -> List[str]:
        """Ensure authors list is not empty and properly formatted.

        Empty or whitespace-only author entries are silently filtered out so
        that a single malformed entry does not cause the entire paper to be
        rejected.  The remaining names are returned with surrounding
        whitespace removed, in line with the other string validators of this
        model and with ``serialize_authors_to_string``, which strips each
        name when storing.  A ``ValueError`` is still raised when the list is
        empty after filtering, or when any remaining entry contains a
        semicolon.
        """
        # Strip first, then drop what is left empty; this keeps names
        # identical before and after a serialize/deserialize round trip.
        cleaned = [name for name in (author.strip() for author in v) if name]
        if not cleaned:
            raise ValueError("Authors list cannot be empty")
        # Semicolons are the separator of the stored authors string.
        if any(";" in name for name in cleaned):
            raise ValueError("Author names cannot contain semicolons")
        return cleaned



[docs]
    @field_validator("abstract")
    @classmethod
    def validate_abstract(cls, v: str) -> str:
        """Reject blank abstracts; return the abstract without surrounding whitespace."""
        trimmed = v.strip() if v else ""
        if not trimmed:
            raise ValueError("Paper abstract cannot be empty")
        return trimmed



[docs]
    @field_validator("session")
    @classmethod
    def validate_session(cls, v: str) -> str:
        """Reject blank session names; return the name without surrounding whitespace."""
        trimmed = v.strip() if v else ""
        if not trimmed:
            raise ValueError("Session cannot be empty")
        return trimmed



[docs]
    @field_validator("conference")
    @classmethod
    def validate_conference(cls, v: str) -> str:
        """Reject blank conference names; return the name without surrounding whitespace."""
        trimmed = v.strip() if v else ""
        if not trimmed:
            raise ValueError("Conference cannot be empty")
        return trimmed



[docs]
    @field_validator("year")
    @classmethod
    def validate_year(cls, v: int) -> int:
        """Accept only years from 1900 to 2100 (both inclusive)."""
        out_of_range = v < 1900 or v > 2100
        if out_of_range:
            raise ValueError(f"Year {v} is not reasonable (must be between 1900 and 2100)")
        return v




# ============================================================================
# Validation Helper Functions
# ============================================================================



[docs]
def sanitize_author_names(authors: List[str]) -> List[str]:
    """
    Remove semicolons from author names.

    Authors are stored as one semicolon-separated string, so a semicolon
    inside a name would be mistaken for a separator.  Every semicolon is
    replaced by a space; afterwards runs of whitespace are collapsed to a
    single space and leading/trailing whitespace is removed.

    Parameters
    ----------
    authors : list of str
        List of author names to sanitize

    Returns
    -------
    list of str
        One sanitized name per input name, in the same order

    Examples
    --------
    >>> sanitize_author_names(["John Doe", "Jane; Smith", "Bob;Johnson"])
    ['John Doe', 'Jane Smith', 'Bob Johnson']

    >>> sanitize_author_names(["Alice"])
    ['Alice']

    >>> sanitize_author_names([])
    []

    >>> sanitize_author_names(["Multi;;Semicolons"])
    ['Multi Semicolons']

    Notes
    -----
    Useful before validation: the LightweightPaper model rejects author
    names that contain semicolons.
    """
    import re

    whitespace_run = re.compile(r"\s+")
    cleaned: List[str] = []
    for name in authors:
        without_semicolons = name.replace(";", " ")
        cleaned.append(whitespace_run.sub(" ", without_semicolons).strip())
    return cleaned



def serialize_authors_to_string(authors: List[str]) -> str:
    """
    Join author names into one semicolon-separated string.

    Used by both :func:`~abstracts_explorer.database.DatabaseManager.add_paper`
    and :meth:`~abstracts_explorer.embeddings.EmbeddingsManager._serialize_metadata_for_chromadb`
    to ensure a consistent storage format in both the SQL database and ChromaDB.

    Parameters
    ----------
    authors : list of str
        Author names to serialize.  Each entry is converted with ``str``
        and stripped.  An empty list or ``None`` returns an empty string.

    Returns
    -------
    str
        Names joined by ``"; "``, e.g. ``"Alice; Bob; Charlie"``, or an
        empty string if *authors* is empty or ``None``.
    """
    if authors:
        return "; ".join([str(name).strip() for name in authors])
    return ""


def deserialize_authors_from_string(authors_str: str) -> List[str]:
    """
    Split a semicolon-separated authors string into a list of names.

    Used when loading paper metadata from ChromaDB to convert the stored
    semicolon-separated string back into a list format for validation and use in
    the application.

    Parameters
    ----------
    authors_str : str
        Semicolon-separated authors string, e.g. ``"Alice; Bob; Charlie"``.

    Returns
    -------
    list of str
        Stripped, non-empty names, e.g. ``["Alice", "Bob", "Charlie"]``.
        An empty list if *authors_str* is empty or only contains whitespace.
    """
    names: List[str] = []
    if authors_str and authors_str.strip():
        for chunk in authors_str.split(";"):
            name = chunk.strip()
            if name:
                names.append(name)
    return names


def serialize_keywords_to_string(keywords: List[str]) -> str:
    """
    Join keywords into one comma-separated string.

    Used by both :func:`~abstracts_explorer.database.DatabaseManager.add_paper`
    and :meth:`~abstracts_explorer.embeddings.EmbeddingsManager._serialize_metadata_for_chromadb`
    to ensure a consistent storage format in both the SQL database and ChromaDB.

    Parameters
    ----------
    keywords : list of str
        Keywords to serialize.  Each entry is converted with ``str`` and
        stripped.  An empty list or ``None`` returns an empty string.

    Returns
    -------
    str
        Keywords joined by ``", "``, e.g. ``"machine learning, deep learning"``,
        or an empty string if *keywords* is empty or ``None``.
    """
    if keywords:
        return ", ".join([str(keyword).strip() for keyword in keywords])
    return ""


def deserialize_keywords_from_string(keywords_str: str) -> List[str]:
    """
    Split a comma-separated keywords string into a list of keywords.

    Used when loading paper metadata from ChromaDB to convert the stored
    comma-separated string back into a list format for validation and use in
    the application.

    Parameters
    ----------
    keywords_str : str
        Comma-separated keywords string, e.g. ``"machine learning, deep learning"``.

    Returns
    -------
    list of str
        Stripped, non-empty keywords, e.g. ``["machine learning", "deep learning"]``.
        An empty list if *keywords_str* is empty or only contains whitespace.
    """
    keywords: List[str] = []
    if keywords_str and keywords_str.strip():
        for chunk in keywords_str.split(","):
            keyword = chunk.strip()
            if keyword:
                keywords.append(keyword)
    return keywords


def _split_delimited(value: Any, separator: str) -> List[str]:
    """Return the stripped, non-empty entries of a delimited string or of a list."""
    if value is None:
        return []
    parts = value.split(separator) if isinstance(value, str) else value
    stripped = (str(part).strip() for part in parts)
    return [part for part in stripped if part]


def prepare_chroma_db_paper_data(paper: Dict[str, Any]) -> Dict[str, Any]:
    """
    Prepare paper data from chroma_db for validation by normalizing and converting fields.

    The dict is modified in place and also returned.  The function is
    idempotent: values that are already lists (for instance because the
    same dict was prepared before) are cleaned rather than split again.

    Conversions applied:

    - ``authors``: semicolon-separated string -> list of stripped names
    - ``year``: converted with ``int``
    - ``keywords``: comma-separated string -> list; the key is removed when
      no keyword remains
    - ``original_id``: converted with ``int``; the key is removed when the
      value is ``""`` or ``None``
    - optional string fields holding ``""`` are removed

    Parameters
    ----------
    paper : dict
        Paper data to prepare.  Must contain ``authors`` and ``year``.

    Returns
    -------
    dict
        The same dict object, prepared for ``LightweightPaper``

    Raises
    ------
    KeyError
        If ``authors`` or ``year`` is missing.
    """
    paper["authors"] = _split_delimited(paper["authors"], ";")
    paper["year"] = int(paper["year"])

    if "keywords" in paper:
        keywords = _split_delimited(paper["keywords"], ",")
        if keywords:
            paper["keywords"] = keywords
        else:
            # Nothing left -> remove so LightweightPaper defaults keywords to None
            del paper["keywords"]

    if "original_id" in paper:
        # Only "" / None mean "no id"; an id of 0 is a real value and is kept.
        if paper["original_id"] in ("", None):
            del paper["original_id"]
        else:
            paper["original_id"] = int(paper["original_id"])

    # Remove empty strings for optional string fields so they default to None in
    # LightweightPaper, preserving round-trip fidelity with the SQL database (where
    # None values are serialised to "" by _serialize_metadata_for_chromadb).
    for field in ("paper_pdf_url", "poster_image_url", "url", "room_name", "starttime", "endtime", "award"):
        if paper.get(field) == "":
            del paper[field]
    return paper



[docs]
def validate_lightweight_paper(paper: Dict[str, Any]) -> LightweightPaper:
    """
    Build a ``LightweightPaper`` from a paper dict.

    The dict's items are passed to the model as keyword arguments, which
    runs the model's validation.

    Parameters
    ----------
    paper : dict
        Paper data to validate

    Returns
    -------
    LightweightPaper
        Validated paper model

    Raises
    ------
    ValidationError
        If the paper data is invalid
    """
    model = LightweightPaper(**paper)
    return model




[docs]
def validate_lightweight_papers(papers: List[Dict[str, Any]]) -> List[LightweightPaper]:
    """
    Validate many paper dicts, keeping only the valid ones.

    A paper that raises ``ValidationError`` is reported with a warning
    and left out; the remaining papers are still processed.  When at
    least one paper was left out, a summary warning with the counts is
    logged as well.

    Parameters
    ----------
    papers : list
        List of paper dicts to validate

    Returns
    -------
    list of LightweightPaper
        Validated paper models, in input order, without the rejected papers
    """
    accepted: List[LightweightPaper] = []

    for candidate in papers:
        try:
            model = validate_lightweight_paper(candidate)
        except ValidationError as exc:
            title = candidate.get("title", "<unknown>")
            logger.warning("Skipping paper '%s': validation failed: %s", title, exc)
        else:
            accepted.append(model)

    skipped = len(papers) - len(accepted)
    if skipped > 0:
        logger.warning("Skipped %d of %d papers due to validation errors", skipped, len(papers))
    return accepted



# Export public API
# NOTE(review): get_all_plugins, get_available_filters,
# resolve_conference_from_url, the serialize_*/deserialize_* helpers and
# prepare_chroma_db_paper_data are defined in this module but not listed
# here, so ``from ... import *`` does not export them -- confirm that this
# is intentional.
__all__ = [
    # Plugin base classes
    "DownloaderPlugin",
    "LightweightDownloaderPlugin",
    # Registry
    "PluginRegistry",
    "register_plugin",
    "get_plugin",
    "list_plugins",
    "list_plugin_names",
    # Conversion utilities
    "convert_to_lightweight_schema",
    # Pydantic models
    "LightweightPaper",
    # Validation functions
    "sanitize_author_names",
    "validate_lightweight_paper",
    "validate_lightweight_papers",
]