"""
Database Models
===============
This module defines SQLAlchemy ORM models for the database tables.
These models support both SQLite and PostgreSQL backends.
"""
from datetime import datetime, timezone
from sqlalchemy import (
Column,
Float,
Integer,
String,
Text,
DateTime,
func,
)
from sqlalchemy.orm import DeclarativeBase
class Base(DeclarativeBase):
    """Base class for all database models.

    Every ORM model in this module derives from this class, so they all
    share the metadata registry that ``DeclarativeBase`` provides.
    """

    pass
class Paper(Base):
    """
    Paper model representing a research paper.

    This uses the lightweight schema from LightweightPaper model.

    Attributes
    ----------
    uid : str
        Unique identifier (hash-based, primary key), at most 16 characters.
    original_id : str, optional
        Original ID from the source (e.g., OpenReview ID).
    title : str
        Paper title.
    authors : str, optional
        Semicolon-separated list of author names.
    abstract : str, optional
        Paper abstract.
    session : str, optional
        Conference session name.
    poster_position : str, optional
        Poster position identifier.
    paper_pdf_url : str, optional
        URL to paper PDF.
    poster_image_url : str, optional
        URL to poster image.
    url : str, optional
        General URL for the paper.
    room_name : str, optional
        Room name for presentation.
    keywords : str, optional
        Comma-separated keywords.
    starttime : str, optional
        Start time of presentation (stored as a string, not a datetime).
    endtime : str, optional
        End time of presentation (stored as a string, not a datetime).
    award : str, optional
        Award received (e.g., "Best Paper").
    year : int, optional
        Publication year.
    conference : str, optional
        Conference name (e.g., "NeurIPS", "ICLR").
    created_at : datetime
        Timestamp when record was created.
    """

    __tablename__ = "papers"

    uid = Column(String(16), primary_key=True, index=True)
    original_id = Column(String, nullable=True, index=True)
    title = Column(Text, nullable=False, index=True)
    authors = Column(Text, nullable=True)
    abstract = Column(Text, nullable=True)
    session = Column(String, nullable=True, index=True)
    poster_position = Column(String, nullable=True)
    paper_pdf_url = Column(String, nullable=True)
    poster_image_url = Column(String, nullable=True)
    url = Column(String, nullable=True)
    room_name = Column(String, nullable=True)
    keywords = Column(Text, nullable=True)
    starttime = Column(String, nullable=True)
    endtime = Column(String, nullable=True)
    award = Column(String, nullable=True)
    year = Column(Integer, nullable=True, index=True)
    conference = Column(String, nullable=True, index=True)
    # Both a Python-side default (timezone-aware UTC) and a server-side
    # default are declared, so rows inserted outside the ORM still get a
    # timestamp.
    created_at = Column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
        server_default=func.now(),
    )

    def __repr__(self) -> str:
        """Return a short debug representation with the title cut to 50 chars.

        The trailing ``...`` is always appended, whether or not the title
        was actually truncated.
        """
        # ``nullable=False`` is only enforced by the database; an instance
        # that has not been populated yet still has ``title`` set to None,
        # and repr() must not raise in that case.
        title_preview = (self.title or "")[:50]
        return f"<Paper(uid='{self.uid}', title='{title_preview}...')>"
class ClusteringCache(Base):
    """
    Clustering cache model.

    Stores cached clustering results including visualization coordinates.
    When only the dimensionality reduction method changes, the clustering
    results (assignments, labels, hierarchy) are reused and only the reduction
    is re-applied, avoiding expensive re-clustering.

    Attributes
    ----------
    id : int
        Auto-incrementing primary key.
    embedding_model : str
        Name of the embedding model used.
    conference : str, optional
        Conference name this cache entry is scoped to (e.g., 'NeurIPS').
    year : int, optional
        Conference year this cache entry is scoped to.
    reduction_method : str
        Dimensionality reduction method used (e.g., 'pca', 'tsne').
    n_components : int
        Number of dimensions after reduction.
    clustering_method : str
        Clustering algorithm used (e.g., 'kmeans', 'dbscan').
    n_clusters : int, optional
        Actual number of clusters in the cached results.
    clustering_params : str, optional
        JSON string of additional clustering parameters.
    results_json : str
        JSON string containing full clustering results including points
        with visualization coordinates.
    created_at : datetime
        Timestamp when cache was created.
    """

    __tablename__ = "clustering_cache"

    id = Column(Integer, primary_key=True, autoincrement=True)
    embedding_model = Column(String, nullable=False, index=True)
    conference = Column(String, nullable=True, index=True)
    year = Column(Integer, nullable=True, index=True)
    reduction_method = Column(String, nullable=False)
    n_components = Column(Integer, nullable=False)
    clustering_method = Column(String, nullable=False, index=True)
    n_clusters = Column(Integer, nullable=True)
    clustering_params = Column(Text, nullable=True)
    results_json = Column(Text, nullable=False)
    # Python-side default is timezone-aware UTC; the server default covers
    # rows inserted without going through the ORM.
    created_at = Column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
        server_default=func.now(),
    )

    def __repr__(self) -> str:
        """Return a debug representation showing scope, method and cluster count."""
        return (
            f"<ClusteringCache(id={self.id}, conference='{self.conference}', "
            f"year={self.year}, method='{self.clustering_method}', n_clusters={self.n_clusters})>"
        )
class HierarchicalLabelCache(Base):
    """
    Hierarchical label cache model.

    Stores cached hierarchical cluster labels for agglomerative clustering.
    Labels are independent of the number of clusters or distance threshold and
    are reused for all agglomerative clustering settings that share the same
    embedding model and linkage method.

    Attributes
    ----------
    id : int
        Auto-incrementing primary key.
    embedding_model : str
        Name of the embedding model used.
    linkage : str
        Linkage method used in agglomerative clustering (e.g., 'ward').
        Defaults to ``"ward"`` on the Python side when not supplied.
    labels_json : str
        JSON string mapping node IDs to their generated labels.
    created_at : datetime
        Timestamp when cache was created.
    """

    __tablename__ = "hierarchical_label_cache"

    id = Column(Integer, primary_key=True, autoincrement=True)
    embedding_model = Column(String, nullable=False, index=True)
    linkage = Column(String, nullable=False, default="ward")
    labels_json = Column(Text, nullable=False)
    # Python-side default is timezone-aware UTC; the server default covers
    # rows inserted without going through the ORM.
    created_at = Column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
        server_default=func.now(),
    )

    def __repr__(self) -> str:
        """Return a debug representation showing id, embedding model and linkage."""
        return f"<HierarchicalLabelCache(id={self.id}, model='{self.embedding_model}', linkage='{self.linkage}')>"
class ValidationData(Base):
    """
    Validation data model.

    Stores anonymized user-donated data about interesting papers
    for validation and service improvement purposes.

    Attributes
    ----------
    id : int
        Auto-incrementing primary key.
    paper_uid : str
        Paper UID reference (anonymized - no direct user identification).
        Stored as a plain indexed string; no foreign-key constraint is
        declared on this column.
    priority : int
        User-assigned priority/rating (1-5). The range is not enforced by
        the schema.
    search_term : str, optional
        Search term or context associated with this paper.
    donated_at : datetime
        Timestamp when data was donated.
    """

    __tablename__ = "validation_data"

    id = Column(Integer, primary_key=True, autoincrement=True)
    paper_uid = Column(String(16), nullable=False, index=True)
    priority = Column(Integer, nullable=False)
    search_term = Column(String, nullable=True)
    # Python-side default is timezone-aware UTC; the server default covers
    # rows inserted without going through the ORM.
    donated_at = Column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
        server_default=func.now(),
    )

    def __repr__(self) -> str:
        """Return a debug representation showing id, paper UID and priority."""
        return f"<ValidationData(id={self.id}, paper_uid='{self.paper_uid}', priority={self.priority})>"
class ChatDonation(Base):
    """
    Chat donation model.

    Stores anonymized user-donated chat transcripts with thumbs up/down
    feedback for improving the chat system.

    Attributes
    ----------
    id : int
        Auto-incrementing primary key.
    rating : str
        User feedback rating ('up' or 'down'). The allowed values are not
        enforced by the schema.
    transcript : str
        JSON string containing the chat transcript (list of messages).
    donated_at : datetime
        Timestamp when data was donated.
    """

    __tablename__ = "chat_donations"

    id = Column(Integer, primary_key=True, autoincrement=True)
    rating = Column(String, nullable=False)
    transcript = Column(Text, nullable=False)
    # Python-side default is timezone-aware UTC; the server default covers
    # rows inserted without going through the ORM.
    donated_at = Column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
        server_default=func.now(),
    )

    def __repr__(self) -> str:
        """Return a debug representation showing id and rating."""
        return f"<ChatDonation(id={self.id}, rating='{self.rating}')>"
class EvalQAPair(Base):
    """
    Evaluation query/answer pair.

    Stores queries and their expected answers for automatic evaluation of the
    RAG system. Supports multi-turn conversations via ``conversation_id`` and
    ``turn_number``.

    Attributes
    ----------
    id : int
        Auto-incrementing primary key.
    conversation_id : str
        Groups related queries in a conversation. All turns in the same
        conversation share this ID.
    turn_number : int
        Position within the conversation (0 = initial query, 1+ = follow-ups).
    query : str
        The user query text.
    expected_answer : str
        The expected/reference answer.
    tool_name : str, optional
        The MCP tool expected to be invoked for this query.
    verified : int
        Verification status: 0 = unverified, 1 = verified/approved,
        -1 = rejected/deleted.
    source_info : str, optional
        JSON string with metadata about how the pair was generated
        (e.g. paper UIDs used, generation model).
    created_at : datetime
        Timestamp when the pair was created.
    updated_at : datetime
        Timestamp when the pair was last modified.
    """

    __tablename__ = "eval_qa_pairs"

    id = Column(Integer, primary_key=True, autoincrement=True)
    conversation_id = Column(String, nullable=False, index=True)
    turn_number = Column(Integer, nullable=False, default=0)
    query = Column(Text, nullable=False)
    expected_answer = Column(Text, nullable=False)
    tool_name = Column(String, nullable=True, index=True)
    verified = Column(Integer, nullable=False, default=0)
    source_info = Column(Text, nullable=True)
    # Python-side default is timezone-aware UTC; the server default covers
    # rows inserted without going through the ORM.
    created_at = Column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
        server_default=func.now(),
    )
    # ``onupdate`` refreshes the timestamp on ORM-issued UPDATE statements;
    # no server-side update trigger is declared here.
    updated_at = Column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
        onupdate=lambda: datetime.now(timezone.utc),
        server_default=func.now(),
    )

    def __repr__(self) -> str:
        """Return a debug representation showing id, conversation, turn and tool."""
        return (
            f"<EvalQAPair(id={self.id}, conv='{self.conversation_id}', "
            f"turn={self.turn_number}, tool='{self.tool_name}')>"
        )
class EvalResult(Base):
    """
    Evaluation run result for a single Q/A pair.

    Stores the actual output from the RAG system when evaluated against a
    stored :class:`EvalQAPair`, together with scoring metrics.

    Attributes
    ----------
    id : int
        Auto-incrementing primary key.
    run_id : str
        Identifier grouping results from the same evaluation run.
    qa_pair_id : int
        ID of the :class:`EvalQAPair` that was evaluated. Stored as a plain
        indexed integer; no foreign-key constraint is declared on this column.
    actual_answer : str, optional
        The answer produced by the RAG system.
    actual_tool_name : str, optional
        The MCP tool actually invoked by the RAG system.
    answer_score : float, optional
        LLM-judged quality score (1–5 scale).
    tool_correct : int, optional
        Whether the correct tool was used (1 = yes, 0 = no).
    latency_ms : int, optional
        Wall-clock time for the query in milliseconds.
    error : str, optional
        Error message if the query failed.
    judge_reasoning : str, optional
        The LLM judge's reasoning for the assigned score.
    created_at : datetime
        Timestamp when the result was recorded.
    """

    __tablename__ = "eval_results"

    id = Column(Integer, primary_key=True, autoincrement=True)
    run_id = Column(String, nullable=False, index=True)
    qa_pair_id = Column(Integer, nullable=False, index=True)
    actual_answer = Column(Text, nullable=True)
    actual_tool_name = Column(String, nullable=True)
    answer_score = Column(Float, nullable=True)
    tool_correct = Column(Integer, nullable=True)
    latency_ms = Column(Integer, nullable=True)
    error = Column(Text, nullable=True)
    judge_reasoning = Column(Text, nullable=True)
    # Python-side default is timezone-aware UTC; the server default covers
    # rows inserted without going through the ORM.
    created_at = Column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(timezone.utc),
        server_default=func.now(),
    )

    def __repr__(self) -> str:
        """Return a debug representation showing id, run, Q/A pair id and score."""
        return (
            f"<EvalResult(id={self.id}, run='{self.run_id}', "
            f"qa_pair={self.qa_pair_id}, score={self.answer_score})>"
        )