# Source code for abstracts_explorer.paper_utils
"""
Paper formatting utilities for NeurIPS abstracts.
This module provides shared utilities for formatting papers from various sources
(database, search results, ChromaDB) with consistent structure and error handling.
"""
import logging
from typing import Any, Dict, List
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# TF-IDF settings for keyword extraction from search results.
# Upper bound on the vocabulary size; passed as ``max_features`` to the
# TF-IDF vectorizer in ``extract_top_keywords``.
_TFIDF_MAX_FEATURES = 500
# Document-count cut-off that selects which ``min_df`` value is used.
_TFIDF_SMALL_CORPUS_THRESHOLD = 3  # corpora smaller than this use min_df=1
# ``min_df`` for corpora below the threshold: a term may occur in a single document.
_TFIDF_MIN_DF_SMALL = 1
# ``min_df`` for all other corpora: a term must occur in at least two documents.
_TFIDF_MIN_DF_REGULAR = 2
# [docs]
class PaperFormattingError(Exception):
    """Signal that search results or paper records could not be formatted."""
# [docs]
def _first_result_list(search_results: Dict[str, Any], key: str):
    """Return the first query's result list under *key*, or None if absent, None or empty."""
    batches = search_results.get(key)
    if batches is None or len(batches) == 0:
        return None
    first = batches[0]
    if first is None or len(first) == 0:
        return None
    return first


def format_search_results(
    search_results: Dict[str, Any],
    database,
    include_documents: bool = True,
) -> List[Dict[str, Any]]:
    """
    Format ChromaDB search results into complete paper records.

    Each result id is looked up in the database to obtain the full paper
    record, which is then enriched with the distance/similarity score and,
    when the database record has no abstract, the document text returned by
    the search. Only the first query's results (index 0 of each field) are
    used.

    Parameters
    ----------
    search_results : dict
        Search results from ChromaDB with 'ids' and optionally 'distances',
        'metadatas', 'documents'. An optional field that is missing, ``None``
        or empty is treated as "not available".
    database : DatabaseManager
        Database instance for fetching complete paper details. Must provide
        ``get_paper_by_uid(uid)``.
    include_documents : bool, optional
        Whether to fall back to the document text (abstract) from the search
        results when the database record has no abstract, by default True.

    Returns
    -------
    list
        List of paper dictionaries as returned by the database, with
        'distance' and 'similarity' added when distances are available.
        Empty when the search returned no ids.

    Raises
    ------
    PaperFormattingError
        If search_results is malformed, the database is missing, the result
        lists have inconsistent lengths, or no paper could be formatted.

    Examples
    --------
    >>> results = embeddings_manager.search_similar("transformers", n_results=5)
    >>> papers = format_search_results(results, database)
    >>> for paper in papers:
    ...     print(paper['title'], paper['similarity'])
    """
    # Validate search results structure
    if not isinstance(search_results, dict):
        raise PaperFormattingError("search_results must be a dictionary.")
    if "ids" not in search_results or not search_results["ids"]:
        raise PaperFormattingError("search_results must contain 'ids' field with results.")

    paper_uids = search_results["ids"][0]
    if not paper_uids:
        # Empty results - valid but return empty list
        return []
    if database is None:
        raise PaperFormattingError("Database connection is required but not provided.")

    # Optional fields: a key that is present but None/empty must not be
    # subscripted, so normalise both to None up front.
    result_count = len(paper_uids)
    distances = _first_result_list(search_results, "distances")
    documents = _first_result_list(search_results, "documents")
    for values in (distances, documents):
        if values is not None and len(values) != result_count:
            raise PaperFormattingError("Inconsistent result lengths in search_results.")

    papers = []
    for i, paper_uid in enumerate(paper_uids):
        # ChromaDB stores UIDs as strings - use them directly
        if not isinstance(paper_uid, str):
            logger.warning("Invalid paper_uid format: %s (%s)", paper_uid, type(paper_uid))
            continue

        try:
            # Get complete paper from database (this validates paper exists)
            paper = database.get_paper_by_uid(paper_uid)
            if paper is None:
                raise PaperFormattingError(f"Paper with uid={paper_uid} not found in database.")
        except PaperFormattingError as e:
            # Log the error but continue with other papers
            logger.warning("Skipping paper %s: %s", paper_uid, e)
            continue

        if distances is not None:
            distance = distances[i]
            paper["distance"] = distance
            # Convert distance to similarity, clamped so it never drops below 0
            paper["similarity"] = max(0.0, 1.0 - distance)

        # Only fall back to the search document if the database abstract is missing
        if include_documents and documents is not None and not paper.get("abstract"):
            paper["abstract"] = documents[i]

        papers.append(paper)

    if not papers:
        raise PaperFormattingError(
            f"No valid papers could be formatted from {result_count} search results. "
            "Check database connectivity and paper IDs."
        )
    return papers
# [docs]
def build_context_from_papers(papers: List[Dict[str, Any]]) -> str:
    """
    Render a list of papers as a plain-text context block for RAG prompts.

    Every paper becomes a numbered section (``Paper 1:``, ``Paper 2:``, ...)
    listing title, authors, the optional topic and decision, and the
    abstract, followed by a blank line.

    Parameters
    ----------
    papers : list
        Paper dictionaries. 'title' is mandatory; 'authors' and 'abstract'
        fall back to "N/A" (with a logged warning) when missing or empty.

    Returns
    -------
    str
        Newline-joined context string for LLM consumption.

    Raises
    ------
    PaperFormattingError
        If papers is not a list, is empty, contains a non-dict entry, or a
        paper has no title.

    Examples
    --------
    >>> papers = format_search_results(results, database)
    >>> context = build_context_from_papers(papers)
    >>> print(context)
    """
    if not isinstance(papers, list):
        raise PaperFormattingError("papers must be a list.")
    if len(papers) == 0:
        raise PaperFormattingError("papers list is empty. Cannot build context.")

    # Optional fields, in the order they are emitted.
    optional_fields = (("topic", "Topic"), ("decision", "Decision"))

    lines = []
    for number, entry in enumerate(papers, start=1):
        if not isinstance(entry, dict):
            raise PaperFormattingError(f"Paper {number} is not a dictionary.")

        title = entry.get("title")
        if not title:
            raise PaperFormattingError(f"Paper {number} missing required 'title' field.")

        raw_authors = entry.get("authors")
        if raw_authors:
            # Lists are flattened to a comma-separated string; anything else is used as-is.
            author_text = ", ".join(raw_authors) if isinstance(raw_authors, list) else raw_authors
        else:
            logger.warning(f"Paper {number} ({title}) has no authors.")
            author_text = "N/A"

        abstract_text = entry.get("abstract", "")
        if not abstract_text:
            logger.warning(f"Paper {number} ({title}) has no abstract.")
            abstract_text = "N/A"

        lines.extend(
            [
                f"Paper {number}:",
                f"Title: {title}",
                f"Authors: {author_text}",
            ]
        )
        for key, label in optional_fields:
            value = entry.get(key)
            if value:
                lines.append(f"{label}: {value}")
        lines.append(f"Abstract: {abstract_text}")
        lines.append("")  # blank separator line after each paper

    return "\n".join(lines)
# [docs]
def extract_top_keywords(papers: List[Dict[str, Any]], n_keywords: int = 5) -> List[str]:
    """
    Extract top keywords from a list of papers using TF-IDF.

    Each paper's title and abstract are combined into one document. Terms are
    2- and 3-word phrases with English stop words removed, ranked by their
    mean TF-IDF score across all documents. Extraction is best-effort: any
    failure inside the vectorizer is logged and yields an empty list.

    Parameters
    ----------
    papers : list
        List of paper dicts, each with optional 'title' and 'abstract' keys.
    n_keywords : int, optional
        Number of top keywords to return (default: 5). Values <= 0 return an
        empty list.

    Returns
    -------
    list
        List of keyword strings, ordered by relevance (highest TF-IDF first).
        Empty when there is no text to analyse or extraction fails.
    """
    if n_keywords <= 0:
        # A non-positive count would otherwise produce meaningless slices.
        return []

    docs = []
    for paper in papers:
        title = paper.get("title", "") or ""
        abstract = paper.get("abstract", "") or ""
        text = f"{title}\n\n{abstract}".strip()
        if text:
            docs.append(text)
    if not docs:
        return []

    # Imported lazily so scikit-learn is only loaded when there is text to analyse.
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Tiny corpora rarely share phrases between documents, so relax min_df for them.
    if len(docs) < _TFIDF_SMALL_CORPUS_THRESHOLD:
        min_df = _TFIDF_MIN_DF_SMALL
    else:
        min_df = _TFIDF_MIN_DF_REGULAR

    try:
        tfidf = TfidfVectorizer(
            max_features=_TFIDF_MAX_FEATURES,
            min_df=min_df,
            stop_words="english",
            ngram_range=(2, 3),
        )
        tfidf_matrix = tfidf.fit_transform(docs)
        feature_names = tfidf.get_feature_names_out()
        mean_tfidf = tfidf_matrix.mean(axis=0).A1
        # argsort is ascending: take the last n entries and reverse for best-first order.
        top_indices = mean_tfidf.argsort()[-n_keywords:][::-1]
        keywords = [feature_names[i] for i in top_indices if mean_tfidf[i] > 0]
        return keywords[:n_keywords]
    except ValueError as exc:
        # Expected when no phrase survives stop-word removal / min_df pruning.
        logger.debug("No keywords extracted from %d document(s): %s", len(docs), exc)
        return []
    except Exception:
        # Keep the best-effort contract, but leave a trace of unexpected failures.
        logger.warning("Keyword extraction failed for %d document(s).", len(docs), exc_info=True)
        return []