From e80636fc0ef9a33d5f5584fcdfea44b19291a3d0 Mon Sep 17 00:00:00 2001 From: Yijia-Xiao Date: Sun, 31 May 2026 01:45:25 +0000 Subject: [PATCH] feat(sentiment): structured output for the Sentiment Analyst The analyst emitted free-form prose, so its sentiment header varied by provider and by run, and downstream consumers had to maintain regexes that drifted with each model release. Extend the structured-output pattern already used by the Trader, Research Manager, and Portfolio Manager: a SentimentReport schema (band + 0-10 score + confidence + narrative) rendered to a deterministic header, with a free-text fallback for providers that lack native structured output. #796 --- tests/test_structured_agents.py | 134 +++++++++++++++++- .../agents/analysts/sentiment_analyst.py | 55 ++++--- tradingagents/agents/schemas.py | 91 +++++++++++- 3 files changed, 259 insertions(+), 21 deletions(-) diff --git a/tests/test_structured_agents.py b/tests/test_structured_agents.py index ea771a4b0..5927f2d11 100644 --- a/tests/test_structured_agents.py +++ b/tests/test_structured_agents.py @@ -1,23 +1,28 @@ -"""Tests for structured-output agents (Trader and Research Manager). +"""Tests for structured-output agents (Trader, Research Manager, Sentiment Analyst). The Portfolio Manager has its own coverage in tests/test_memory_log.py (which exercises the full memory-log → PM injection cycle). This file covers the parallel schemas, render functions, and graceful-fallback -behavior we added for the Trader and Research Manager so all three -decision-making agents share the same shape. +behavior we added for the Trader, Research Manager, and Sentiment Analyst +so they share the same deterministic output shape. 
""" from unittest.mock import MagicMock import pytest +from pydantic import ValidationError +from tradingagents.agents.analysts.sentiment_analyst import create_sentiment_analyst from tradingagents.agents.managers.research_manager import create_research_manager from tradingagents.agents.schemas import ( PortfolioRating, ResearchPlan, + SentimentBand, + SentimentReport, TraderAction, TraderProposal, render_research_plan, + render_sentiment_report, render_trader_proposal, ) from tradingagents.agents.trader.trader import create_trader @@ -230,3 +235,126 @@ class TestResearchManagerAgent: rm = create_research_manager(llm) result = rm(_make_rm_state()) assert result["investment_plan"] == plain_response + + +# --------------------------------------------------------------------------- +# Sentiment Analyst: schema, render, structured happy path + fallback +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +class TestRenderSentimentReport: + def test_header_contains_band_and_score(self): + report = SentimentReport( + overall_band=SentimentBand.BULLISH, + overall_score=7.2, + confidence="high", + narrative="Source breakdown here.", + ) + md = render_sentiment_report(report) + assert "**Overall Sentiment:** **Bullish**" in md + assert "(Score: 7.2/10)" in md + + def test_header_contains_confidence(self): + report = SentimentReport( + overall_band=SentimentBand.NEUTRAL, + overall_score=5.0, + confidence="low", + narrative="Limited data.", + ) + assert "**Confidence:** Low" in render_sentiment_report(report) + + def test_narrative_preserved_in_output(self): + narrative = "## Breakdown\n\nStockTwits: 70% bullish.\n\n| Signal | Direction |\n|---|---|\n| News | Neutral |" + report = SentimentReport( + overall_band=SentimentBand.MILDLY_BULLISH, + overall_score=6.0, + confidence="medium", + narrative=narrative, + ) + assert narrative in render_sentiment_report(report) + + def test_all_six_bands_render(self): + for band in 
SentimentBand: + report = SentimentReport( + overall_band=band, overall_score=5.0, + confidence="medium", narrative="n", + ) + assert band.value in render_sentiment_report(report) + + def test_score_out_of_range_rejected(self): + with pytest.raises(ValidationError): + SentimentReport( + overall_band=SentimentBand.BULLISH, overall_score=11.0, + confidence="high", narrative="n", + ) + + +def _make_sentiment_state(): + return { + "company_of_interest": "NVDA", + "trade_date": "2026-01-15", + "asset_type": "stock", + "messages": [], + } + + +def _structured_sentiment_llm(captured: dict, report: SentimentReport | None = None): + """MagicMock LLM whose structured binding captures the prompt and returns + a real SentimentReport so render_sentiment_report works.""" + if report is None: + report = SentimentReport( + overall_band=SentimentBand.BULLISH, overall_score=7.5, + confidence="high", + narrative="StockTwits 75% bullish. News constructive. Reddit upbeat.", + ) + structured = MagicMock() + structured.invoke.side_effect = lambda prompt: ( + captured.__setitem__("prompt", prompt) or report + ) + llm = MagicMock() + llm.with_structured_output.return_value = structured + return llm + + +@pytest.mark.unit +class TestSentimentAnalystAgent: + def test_structured_path_produces_rendered_markdown(self): + captured = {} + report = SentimentReport( + overall_band=SentimentBand.MILDLY_BEARISH, overall_score=4.0, + confidence="medium", narrative="Mixed signals across sources.", + ) + analyst = create_sentiment_analyst(_structured_sentiment_llm(captured, report)) + sr = analyst(_make_sentiment_state())["sentiment_report"] + assert "**Overall Sentiment:** **Mildly Bearish**" in sr + assert "(Score: 4.0/10)" in sr + assert "Mixed signals across sources." 
in sr + + def test_sentiment_report_also_in_messages(self): + captured = {} + analyst = create_sentiment_analyst(_structured_sentiment_llm(captured)) + result = analyst(_make_sentiment_state()) + assert len(result["messages"]) == 1 + assert result["sentiment_report"] == result["messages"][0].content + + def test_prompt_contains_ticker(self): + captured = {} + create_sentiment_analyst(_structured_sentiment_llm(captured))(_make_sentiment_state()) + assert any("NVDA" in str(m) for m in captured["prompt"]) + + def test_falls_back_to_freetext_when_structured_unavailable(self): + plain = "**Overall Sentiment:** **Bearish** (Score: 3.0/10)\n**Confidence:** Low\n\nLimited data." + llm = MagicMock() + llm.with_structured_output.side_effect = NotImplementedError("provider unsupported") + llm.invoke.return_value = MagicMock(content=plain) + assert create_sentiment_analyst(llm)(_make_sentiment_state())["sentiment_report"] == plain + + def test_falls_back_to_freetext_when_structured_call_fails(self): + plain = "Fallback free-text sentiment." + structured = MagicMock() + structured.invoke.side_effect = ValueError("bad JSON from model") + llm = MagicMock() + llm.with_structured_output.return_value = structured + llm.invoke.return_value = MagicMock(content=plain) + assert create_sentiment_analyst(llm)(_make_sentiment_state())["sentiment_report"] == plain diff --git a/tradingagents/agents/analysts/sentiment_analyst.py b/tradingagents/agents/analysts/sentiment_analyst.py index ad9c5d72c..c79741507 100644 --- a/tradingagents/agents/analysts/sentiment_analyst.py +++ b/tradingagents/agents/analysts/sentiment_analyst.py @@ -14,19 +14,31 @@ the LLM is invoked and injects them into the prompt as structured blocks: 3. Reddit posts — r/wallstreetbets, r/stocks, r/investing The agent does not use tool-calling; the data is in the prompt from -turn 0. The LLM produces the sentiment report in a single invocation. +turn 0. 
Output uses the structured-output pattern (json_schema for +OpenAI/xAI, response_schema for Gemini, tool-use for Anthropic), falling +back to free-text generation for providers that lack native support, so +the sentiment header (band + score + confidence) is deterministic across +runs and providers instead of free-form per-model prose. See: https://github.com/TauricResearch/TradingAgents/issues/557 +See: https://github.com/TauricResearch/TradingAgents/issues/796 """ from datetime import datetime, timedelta +from langchain_core.messages import AIMessage from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder + +from tradingagents.agents.schemas import SentimentReport, render_sentiment_report from tradingagents.agents.utils.agent_utils import ( get_instrument_context_from_state, get_language_instruction, get_news, ) +from tradingagents.agents.utils.structured import ( + bind_structured, + invoke_structured_or_freetext, +) from tradingagents.dataflows.reddit import fetch_reddit_posts from tradingagents.dataflows.stocktwits import fetch_stocktwits_messages @@ -39,9 +51,11 @@ def create_sentiment_analyst(llm): """Create a sentiment analyst node for the trading graph. Pre-fetches news + StockTwits + Reddit data, injects them into the - prompt as structured blocks, and produces a sentiment report in a - single LLM call. + prompt as structured blocks, and produces a deterministic sentiment + report via structured output (with a free-text fallback for providers + that do not support it). """ + structured_llm = bind_structured(llm, SentimentReport, "Sentiment Analyst") def sentiment_analyst_node(state): ticker = state["company_of_interest"] @@ -83,14 +97,22 @@ def create_sentiment_analyst(llm): prompt = prompt.partial(current_date=end_date) prompt = prompt.partial(instrument_context=instrument_context) - # No bind_tools — the data is already in the prompt; a single LLM - # call produces the report directly. 
- chain = prompt | llm - result = chain.invoke(state["messages"]) + # Format the template into a concrete message list so the structured + # and free-text paths receive the same input. No bind_tools — the + # data is already in the prompt. + formatted_messages = prompt.format_messages(messages=state["messages"]) + + report_text = invoke_structured_or_freetext( + structured_llm, + llm, + formatted_messages, + render_sentiment_report, + "Sentiment Analyst", + ) return { - "messages": [result], - "sentiment_report": result.content, + "messages": [AIMessage(content=report_text)], + "sentiment_report": report_text, } return sentiment_analyst_node @@ -143,21 +165,20 @@ Community discussion. Engagement signal via upvote score and comment count. Subr 5. **Identify recurring narrative themes.** What topic keeps coming up across sources? That's the dominant narrative driving current sentiment. -6. **Be honest about data limits.** If StockTwits returned only a handful of messages, or one or more sources returned an "" placeholder, the sentiment read is less robust — flag this caveat explicitly. If the sources are silent on a given subreddit, say so. +6. **Be honest about data limits.** If StockTwits returned only a handful of messages, or one or more sources returned an "" placeholder, the sentiment read is less robust — flag this explicitly in the `confidence` field and the narrative. If the sources are silent on a given subreddit, say so. 7. **Identify catalysts and risks** that emerge across sources — news of upcoming earnings, product launches, competitive threats, macro headlines, etc. 8. **Past sentiment is not predictive.** Frame your conclusions as signal for the trader to weigh alongside fundamentals and technicals, not as a price call. -## Output +## Output fields -Produce a sentiment report covering, in order: +Fill the following fields: -1. 
**Overall sentiment direction** — Bullish / Bearish / Neutral / Mixed — with a brief confidence note based on data quality and sample size. -2. **Source-by-source breakdown** — what each of news / StockTwits / Reddit is telling you, with specific evidence (cite message counts, ratios, notable posts). -3. **Divergences, alignments, and key narratives** across sources. -4. **Catalysts and risks** surfaced by the data. -5. **Markdown table** at the end summarizing key sentiment signals, their direction, source, and supporting evidence. +- **overall_band**: Exactly one of Bullish / Mildly Bullish / Neutral / Mixed / Mildly Bearish / Bearish. Use Mixed when sources point in clearly different directions; Neutral only when all sources are genuinely silent. +- **overall_score**: A number from 0 (maximally bearish) to 10 (maximally bullish); 5 is neutral. Keep it consistent with overall_band. +- **confidence**: low / medium / high, based on data quality and sample size. +- **narrative**: Full source-by-source breakdown, divergences, dominant narrative themes, catalysts and risks, and a markdown summary table of key sentiment signals (direction, source, supporting evidence). 
{get_language_instruction()}""" diff --git a/tradingagents/agents/schemas.py b/tradingagents/agents/schemas.py index 55f0e3cfb..f89878c41 100644 --- a/tradingagents/agents/schemas.py +++ b/tradingagents/agents/schemas.py @@ -19,7 +19,7 @@ so that: from __future__ import annotations from enum import Enum -from typing import Optional +from typing import Literal, Optional from pydantic import BaseModel, Field @@ -226,3 +226,92 @@ def render_pm_decision(decision: PortfolioDecision) -> str: if decision.time_horizon: parts.extend(["", f"**Time Horizon**: {decision.time_horizon}"]) return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Sentiment Analyst +# --------------------------------------------------------------------------- + + +class SentimentBand(str, Enum): + """Discrete sentiment direction produced by the Sentiment Analyst. + + Six tiers keep the signal granular enough to be actionable while remaining + small enough for every provider to map reliably from its JSON output. + """ + + BULLISH = "Bullish" + MILDLY_BULLISH = "Mildly Bullish" + NEUTRAL = "Neutral" + MIXED = "Mixed" + MILDLY_BEARISH = "Mildly Bearish" + BEARISH = "Bearish" + + +class SentimentReport(BaseModel): + """Structured sentiment report produced by the Sentiment Analyst. + + Replaces the previous free-form prose output so downstream consumers + (dashboards, audit logs, PDF renderers, other agents) can read + ``overall_band`` and ``overall_score`` without maintaining fragile regex + fallbacks that drift with every model release. ``narrative`` preserves the + rich source-by-source analysis; ``render_sentiment_report`` prepends a + deterministic header so the saved report stays human-readable. + """ + + overall_band: SentimentBand = Field( + description=( + "Overall sentiment direction. Exactly one of: " + "Bullish / Mildly Bullish / Neutral / Mixed / Mildly Bearish / Bearish. " + "Use Mixed when sources point in clearly different directions. 
" + "Use Neutral only when all sources are genuinely silent or non-committal." + ), + ) + overall_score: float = Field( + ge=0.0, + le=10.0, + description=( + "Numeric sentiment intensity on a 0–10 scale. " + "0 = maximally bearish, 5 = neutral, 10 = maximally bullish. " + "Guideline for consistency with overall_band: " + "Bullish ~6.5–10, Mildly Bullish ~5.5–6.4, Neutral/Mixed ~4.5–5.5, " + "Mildly Bearish ~3.5–4.4, Bearish ~0–3.4. " + "Only the 0–10 bounds are enforced." + ), + ) + confidence: Literal["low", "medium", "high"] = Field( + description=( + "Confidence in the assessment based on data quality and sample size. " + "Use 'low' when one or more sources returned a placeholder or fewer " + "than 5 data points; 'medium' when data is present but sparse; " + "'high' when all three sources returned substantive data." + ), + ) + narrative: str = Field( + description=( + "Full sentiment report covering, in order: " + "(1) source-by-source breakdown with specific evidence (cite message " + "counts, ratios, notable posts); " + "(2) cross-source divergences and alignments; " + "(3) dominant narrative themes; " + "(4) catalysts and risks surfaced by the data; " + "(5) a markdown table summarising key sentiment signals, their " + "direction, source, and supporting evidence." + ), + ) + + +def render_sentiment_report(report: SentimentReport) -> str: + """Render a SentimentReport to the markdown shape the rest of the system expects. + + The structured header (band + score + confidence) is prepended to the + narrative so the saved report is both human-readable and machine-parseable + without regex. + """ + return "\n".join([ + f"**Overall Sentiment:** **{report.overall_band.value}** " + f"(Score: {report.overall_score:.1f}/10)", + f"**Confidence:** {report.confidence.capitalize()}", + "", + report.narrative, + ])