Source code for abstracts_explorer.config

"""
Configuration management for neurips-abstracts package.

This module loads configuration from environment variables and .env files.
Uses only standard library (no python-dotenv dependency required).
"""

import os
from pathlib import Path
from typing import Optional, Dict, Any


def load_env_file(env_path: Optional[Path] = None) -> Dict[str, str]:
    """
    Load environment variables from a .env file.

    Uses a simple parser that handles the basic ``KEY=VALUE`` format without
    requiring external dependencies. Blank lines, lines starting with ``#``
    and lines without ``=`` are ignored. Keys and values are stripped of
    surrounding whitespace, and one pair of matching single or double quotes
    around a value is removed.

    Reading is best-effort: if the file cannot be read or decoded, a warning
    is logged and whatever was parsed before the failure is returned.

    Parameters
    ----------
    env_path : Path, optional
        Path to .env file. If None, looks for a ``.env`` file in the current
        directory and then in up to four parent directories.

    Returns
    -------
    dict
        Dictionary of environment variables loaded from file. Empty if no
        file was found.

    Examples
    --------
    >>> env_vars = load_env_file(Path(".env"))
    >>> print(env_vars.get("CHAT_MODEL"))
    """
    import logging

    def _is_readable_candidate(candidate: Path) -> bool:
        # A directory named ".env" (a common virtualenv name) is not a config
        # file; rejecting it lets the upward search keep going.
        return candidate.exists() and not candidate.is_dir()

    if env_path is None:
        # Look for .env starting from the current directory, checking it and
        # up to four ancestors.
        current = Path.cwd()
        for _ in range(5):
            candidate = current / ".env"
            if _is_readable_candidate(candidate):
                env_path = candidate
                break
            current = current.parent

    if env_path is None:
        return {}

    env_path = Path(env_path)  # tolerate callers that pass a plain string
    if not _is_readable_candidate(env_path):
        return {}

    env_vars: Dict[str, str] = {}
    try:
        # utf-8-sig decodes plain UTF-8 and also drops a leading BOM, which
        # would otherwise become part of the first key.
        with open(env_path, "r", encoding="utf-8-sig") as handle:
            for raw_line in handle:
                line = raw_line.strip()

                # Skip empty lines, comments and anything that is not KEY=VALUE.
                if not line or line.startswith("#") or "=" not in line:
                    continue

                key, value = line.split("=", 1)
                key = key.strip()
                value = value.strip()

                # Remove one pair of matching quotes. The length check keeps a
                # value that is a single quote character from being emptied.
                if len(value) >= 2 and value[0] in ('"', "'") and value[-1] == value[0]:
                    value = value[1:-1]

                env_vars[key] = value
    except (OSError, UnicodeDecodeError) as exc:
        # Best-effort by design: report the problem instead of hiding it, but
        # never let a broken .env file crash configuration loading.
        logging.getLogger(__name__).warning("Could not read .env file %s: %s", env_path, exc)

    return env_vars
class Config:
    """
    Configuration manager for neurips-abstracts package.

    Loads configuration from environment variables with fallback to defaults.
    Automatically loads from .env file if present. Real environment variables
    take precedence over values from the .env file.

    Attributes
    ----------
    data_dir : str
        Base directory for data files (databases, embeddings).
    chat_model : str
        Name of the language model for chat/RAG.
    embedding_model : str
        Name of the embedding model.
    llm_backend_url : str
        URL for OpenAI-compatible API endpoint.
    llm_backend_auth_token : str
        Authentication token for LLM backend (if required).
    embedding_db : str
        ChromaDB configuration - can be either a URL (e.g., "http://chromadb:8000")
        or a file path (e.g., "chroma_db" or "/path/to/chroma_db"). File paths
        are resolved relative to ``data_dir``.
    collection_name : str
        Name of the ChromaDB collection.
    max_context_papers : int
        Default number of papers for RAG context.
    chat_temperature : float
        Default temperature for chat generation.
    chat_max_tokens : int
        Default max tokens for chat responses.
    enable_query_rewriting : bool
        Whether to enable query rewriting for better semantic search.
    query_similarity_threshold : float
        Similarity threshold for determining when to retrieve new papers (0.0-1.0).
    database_url : str
        SQLAlchemy database URL (supports SQLite, PostgreSQL, etc.).
        Automatically constructed from the PAPER_DB config variable: a value
        containing "://" is used as-is, anything else is treated as a SQLite
        file path resolved relative to ``data_dir``.
    log_level : str
        Logging level from environment (WARNING, INFO, DEBUG), upper-cased.
        Empty string if not set.

    Examples
    --------
    >>> config = Config()
    >>> print(config.chat_model)
    'diffbot-small-xl-2508'
    >>> config.llm_backend_url
    'http://localhost:1234'

    >>> # Using DATABASE_URL for PostgreSQL
    >>> config.database_url
    'postgresql://user:password@localhost/abstracts'
    """

    def __init__(self, env_path: Optional[Path] = None):
        """
        Initialize configuration.

        Parameters
        ----------
        env_path : Path, optional
            Path to .env file. If None, searches for .env automatically.
        """
        # Load .env file if it exists
        env_vars = load_env_file(env_path)

        # Merge with actual environment variables (environment variables take precedence)
        self._env = {**env_vars, **os.environ}

        # Load all configuration values
        self._load_config()

    def _load_config(self):
        """Load configuration from the merged environment mapping in ``self._env``."""
        # Data Directory (base directory for all data files). Must be set
        # first: _resolve_path() below reads it.
        self.data_dir = self._get_env("DATA_DIR", default="data")

        # Chat/Language Model Settings
        self.chat_model = self._get_env("CHAT_MODEL", default="diffbot-small-xl-2508")
        self.chat_temperature = self._get_env_float("CHAT_TEMPERATURE", default=0.7)
        self.chat_max_tokens = self._get_env_int("CHAT_MAX_TOKENS", default=1000)

        # Embedding Model Settings
        self.embedding_model = self._get_env("EMBEDDING_MODEL", default="text-embedding-qwen3-embedding-4b")

        # LLM Backend Configuration
        self.llm_backend_url = self._get_env("LLM_BACKEND_URL", default="http://localhost:1234")
        self.llm_backend_auth_token = self._get_env("LLM_BACKEND_AUTH_TOKEN", default="")

        # Database Configuration
        # PAPER_DB can be either:
        # 1. A database URL (e.g., "postgresql://user:pass@host/db", "sqlite:///x.db")
        # 2. A file path for SQLite (e.g., "abstracts.db" or "/path/to/abstracts.db")
        paper_db = self._get_env("PAPER_DB", default="abstracts.db")
        if "://" in paper_db:
            # Any full database URL is passed through untouched.
            self.database_url = paper_db
        else:
            # File path - treat as SQLite
            paper_db_path = self._resolve_path(paper_db)
            self.database_url = f"sqlite:///{paper_db_path}"

        # Embedding database configuration
        # EMBEDDING_DB can be either a URL (e.g., "http://chromadb:8000")
        # or a file path (e.g., "chroma_db" or "/path/to/chroma_db")
        embedding_db = self._get_env("EMBEDDING_DB", default="chroma_db")
        if embedding_db.startswith(("http://", "https://")):
            # URL provided - use as-is
            self.embedding_db = embedding_db
        else:
            # File path - resolve relative to data_dir
            self.embedding_db = self._resolve_path(embedding_db)

        # Collection Settings
        self.collection_name = self._get_env("COLLECTION_NAME", default="papers")

        # RAG Settings
        self.max_context_papers = self._get_env_int("MAX_CONTEXT_PAPERS", default=5)

        # Query Rewriting Settings
        self.enable_query_rewriting = self._get_env_bool("ENABLE_QUERY_REWRITING", default=True)
        self.query_similarity_threshold = self._get_env_float("QUERY_SIMILARITY_THRESHOLD", default=0.7)

        # Logging Configuration
        self.log_level = self._get_env("LOG_LEVEL", default="").upper()

    def _get_env(self, key: str, default: str = "") -> str:
        """
        Get string environment variable.

        Parameters
        ----------
        key : str
            Environment variable name.
        default : str
            Default value if not set.

        Returns
        -------
        str
            Environment variable value or default.
        """
        return self._env.get(key, default)

    def _get_env_int(self, key: str, default: int = 0) -> int:
        """
        Get integer environment variable.

        Parameters
        ----------
        key : str
            Environment variable name.
        default : int
            Default value if not set or invalid.

        Returns
        -------
        int
            Environment variable value as integer or default.
        """
        value = self._env.get(key, "")
        try:
            return int(value)
        except (ValueError, TypeError):
            return default

    def _get_env_float(self, key: str, default: float = 0.0) -> float:
        """
        Get float environment variable.

        Parameters
        ----------
        key : str
            Environment variable name.
        default : float
            Default value if not set or invalid.

        Returns
        -------
        float
            Environment variable value as float or default.
        """
        value = self._env.get(key, "")
        try:
            return float(value)
        except (ValueError, TypeError):
            return default

    def _get_env_bool(self, key: str, default: bool = False) -> bool:
        """
        Get boolean environment variable.

        Recognizes "true"/"1"/"yes"/"on" and "false"/"0"/"no"/"off",
        case-insensitively and ignoring surrounding whitespace.

        Parameters
        ----------
        key : str
            Environment variable name.
        default : bool
            Default value if not set or invalid.

        Returns
        -------
        bool
            Environment variable value as boolean or default.
        """
        value = self._env.get(key, "").strip().lower()
        if value in ("true", "1", "yes", "on"):
            return True
        if value in ("false", "0", "no", "off"):
            return False
        return default

    def _resolve_path(self, path: str) -> str:
        """
        Resolve a path relative to the data directory.

        A leading ``~`` is expanded first. If the resulting path is absolute
        it is returned as-is; otherwise it is resolved relative to data_dir.

        Parameters
        ----------
        path : str
            Path to resolve.

        Returns
        -------
        str
            Absolute path as a string.
        """
        # Expand "~" BEFORE the absoluteness check: "~/x" is not absolute
        # until expanded, and would otherwise end up under data_dir.
        # os.path.expanduser leaves the path unchanged (rather than raising)
        # when no home directory can be determined.
        expanded = Path(os.path.expanduser(path))
        if expanded.is_absolute():
            return str(expanded.absolute())

        # Resolve relative to data_dir and make absolute
        return str((Path(self.data_dir) / expanded).absolute())

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert configuration to dictionary.

        Secrets are not exposed: the auth token is replaced by ``"***"`` when
        set, and the password in the database URL is masked.

        Returns
        -------
        dict
            Dictionary of all configuration values.

        Examples
        --------
        >>> config = Config()
        >>> config_dict = config.to_dict()
        >>> print(config_dict["chat_model"])
        """
        return {
            "data_dir": self.data_dir,
            "chat_model": self.chat_model,
            "chat_temperature": self.chat_temperature,
            "chat_max_tokens": self.chat_max_tokens,
            "embedding_model": self.embedding_model,
            "llm_backend_url": self.llm_backend_url,
            "llm_backend_auth_token": "***" if self.llm_backend_auth_token else "",
            "embedding_db": self.embedding_db,
            "database_url": self._mask_database_url(self.database_url),
            "collection_name": self.collection_name,
            "max_context_papers": self.max_context_papers,
            "enable_query_rewriting": self.enable_query_rewriting,
            "query_similarity_threshold": self.query_similarity_threshold,
            "log_level": self.log_level,
        }

    def _mask_database_url(self, url: str) -> str:
        """
        Mask password in database URL for display.

        Only the authority part of the URL (between ``://`` and the first
        ``/``, ``?`` or ``#``) is inspected, so file paths and query strings
        are never rewritten.

        Parameters
        ----------
        url : str
            Database URL that may contain password.

        Returns
        -------
        str
            URL with password masked, or the URL unchanged when it carries
            no ``user:password@`` credentials.
        """
        if not url or "://" not in url:
            return url

        scheme, _, rest = url.partition("://")

        # Find where the authority (credentials + host) ends.
        authority_end = len(rest)
        for delimiter in "/?#":
            position = rest.find(delimiter)
            if position != -1:
                authority_end = min(authority_end, position)
        authority, tail = rest[:authority_end], rest[authority_end:]

        # Split on the LAST "@" so a password that itself contains "@" is
        # still recognized and masked instead of being returned in clear text.
        credentials, at_sign, host = authority.rpartition("@")
        if not at_sign or ":" not in credentials:
            return url

        user = credentials.split(":", 1)[0]
        return f"{scheme}://{user}:***@{host}{tail}"

    def __repr__(self) -> str:
        """String representation of configuration (secrets masked via to_dict)."""
        items = [f"{key}={value}" for key, value in self.to_dict().items()]
        return f"Config({', '.join(items)})"
# Module-level cache for the shared Config instance; built lazily by get_config().
_config: Optional[Config] = None


def get_config(reload: bool = False, env_path: Optional[Path] = None) -> Config:
    """
    Return the shared module-level configuration, creating it on demand.

    Parameters
    ----------
    reload : bool, optional
        When True, discard the cached instance and build a new one,
        by default False
    env_path : Path, optional
        Path to a .env file, forwarded to ``Config``. It is only used when an
        instance is actually built (first call, or ``reload=True``); on other
        calls the cached instance is returned and this argument is ignored.
        Useful for testing to ensure consistent configuration.

    Returns
    -------
    Config
        Global configuration instance.

    Examples
    --------
    >>> config = get_config()
    >>> print(config.chat_model)

    >>> # In tests, use .env.tests for consistent values
    >>> config = get_config(reload=True, env_path=Path(".env.tests"))
    """
    global _config

    must_build = reload or _config is None
    if must_build:
        _config = Config(env_path=env_path)

    return _config