From 9fd54f83682284fed82ec54e3719a12fd440a0f9 Mon Sep 17 00:00:00 2001 From: Yijia-Xiao Date: Sun, 14 Jun 2026 07:10:15 +0000 Subject: [PATCH] fix(data): reject stale yfinance OHLCV instead of reporting wrong prices yfinance intermittently returns a year-old partial frame (e.g. June 2025 rows for a June 2026 request) that still has rows and a Close, so it passed the empty-check and silently fed a wrong close price and indicators into the report (#1021). Add a freshness guard that rejects a frame whose latest row is far older than the requested date, on both the raw OHLCV path and the indicator path. It raises the existing NoMarketDataError with a stale-specific detail, so the vendor router's try-next-vendor and single unavailable-signal handling apply unchanged; the sentinel now surfaces that detail so the agent reports the specific reason rather than fabricating a value. --- tests/test_yfinance_stale_ohlcv_guard.py | 113 ++++++++++++++++++++ tradingagents/dataflows/stockstats_utils.py | 74 ++++++++++++- tradingagents/dataflows/y_finance.py | 47 +++++--- 3 files changed, 212 insertions(+), 22 deletions(-) create mode 100644 tests/test_yfinance_stale_ohlcv_guard.py diff --git a/tests/test_yfinance_stale_ohlcv_guard.py b/tests/test_yfinance_stale_ohlcv_guard.py new file mode 100644 index 000000000..4ab425f2d --- /dev/null +++ b/tests/test_yfinance_stale_ohlcv_guard.py @@ -0,0 +1,113 @@ +"""Stale OHLCV guard (#1021): a vendor returning a year-old partial frame must +be rejected, not fed into the report as if it were current. + +The guard raises NoMarketDataError with a stale-specific detail, so the router's +existing try-next-vendor + single-sentinel handling applies and the sentinel +surfaces the reason. +""" +import copy +import unittest +from unittest import mock + +import pandas as pd +import pytest + +import tradingagents.dataflows.config as config_module +import tradingagents.dataflows.y_finance as y_finance +import tradingagents.default_config as default_config +from tradingagents.dataflows import interface +from tradingagents.dataflows.config import set_config +from tradingagents.dataflows.stockstats_utils import _assert_ohlcv_not_stale +from tradingagents.dataflows.symbol_utils import NoMarketDataError + + +def _frame(date): + return pd.DataFrame( + { + "Date": [pd.Timestamp(date)], + "Open": [330.0], + "High": [332.0], + "Low": [328.0], + "Close": [330.58], + "Volume": [1_000_000], + } + ) + + +@pytest.mark.unit +class StaleGuardUnitTests(unittest.TestCase): + def test_recent_prior_trading_day_is_accepted(self): + # 1 day before curr_date — well within the freshness window. + _assert_ohlcv_not_stale(_frame("2026-06-10"), "2026-06-11", "CB") + + def test_year_old_row_is_rejected_with_detail(self): + with self.assertRaises(NoMarketDataError) as ctx: + _assert_ohlcv_not_stale(_frame("2025-06-11"), "2026-06-11", "CB", "CB") + msg = str(ctx.exception) + self.assertIn("2025-06-11", msg) + self.assertIn("2026-06-11", msg) + self.assertIn("stale", msg) + + def test_empty_frame_is_left_to_caller(self): + # Empty is a no-data condition handled elsewhere, not a staleness one. + _assert_ohlcv_not_stale( + pd.DataFrame(columns=["Date", "Close"]), "2026-06-11", "X" + ) + + def test_long_holiday_gap_within_threshold_is_accepted(self): + _assert_ohlcv_not_stale(_frame("2026-06-02"), "2026-06-11", "X") # 9 days + + +@pytest.mark.unit +class StaleGuardPropagationTests(unittest.TestCase): + def test_get_yfin_data_online_raises_on_stale_frame(self): + stale = pd.DataFrame( + { + "Open": [280.0], "High": [286.0], "Low": [278.0], + "Close": [284.45], "Volume": [1_000_000], + }, + index=pd.DatetimeIndex([pd.Timestamp("2025-06-11")], name="Date"), + ) + + class DummyTicker: + def __init__(self, symbol): + pass + + def history(self, start, end): + return stale + + with mock.patch.object(y_finance.yf, "Ticker", DummyTicker), \ + self.assertRaises(NoMarketDataError): + y_finance.get_YFin_data_online("CB", "2026-06-01", "2026-06-11") + + +@pytest.mark.unit +class StaleGuardRoutingTests(unittest.TestCase): + def setUp(self): + config_module._config = copy.deepcopy(default_config.DEFAULT_CONFIG) + + def tearDown(self): + config_module._config = copy.deepcopy(default_config.DEFAULT_CONFIG) + + def test_router_sentinel_surfaces_stale_reason(self): + set_config({"data_vendors": {"core_stock_apis": "yfinance"}}) + + def _stale(symbol, *a, **k): + raise NoMarketDataError( + symbol, symbol, "latest row is 2025-06-11, 365 days before ... (stale)" + ) + + with mock.patch.dict( + interface.VENDOR_METHODS, + {"get_stock_data": {"yfinance": _stale}}, + clear=False, + ): + out = interface.route_to_vendor( + "get_stock_data", "CB", "2026-06-01", "2026-06-11" + ) + self.assertIn("NO_DATA_AVAILABLE", out) + self.assertIn("stale", out) # the typed detail is surfaced to the agent + + +if __name__ == "__main__": + unittest.main() diff --git a/tradingagents/dataflows/stockstats_utils.py b/tradingagents/dataflows/stockstats_utils.py index 585e581ea..db880b046 100644 --- a/tradingagents/dataflows/stockstats_utils.py +++ b/tradingagents/dataflows/stockstats_utils.py @@ -1,18 +1,24 @@ -import time import logging +import os +import time +from typing import Annotated import pandas as pd import yfinance as yf -from yfinance.exceptions import YFRateLimitError from stockstats import wrap -from typing import Annotated -import os +from yfinance.exceptions import YFRateLimitError + from .config import get_config +from .symbol_utils import NoMarketDataError, normalize_symbol from .utils import safe_ticker_component -from .symbol_utils import normalize_symbol, NoMarketDataError logger = logging.getLogger(__name__) +# A vendor's latest OHLCV row this many calendar days before the requested date +# is treated as stale. Generous enough to span long holiday weekends, tight +# enough to catch the year-old frames yfinance occasionally returns (#1021). +MAX_OHLCV_STALE_DAYS = 10 + def yf_retry(func, max_retries=3, base_delay=2.0): """Execute a yfinance call with exponential backoff on rate limits. @@ -62,6 +68,60 @@ def _clean_dataframe(data: pd.DataFrame) -> pd.DataFrame: return data +def _coerce_ohlcv_dates(data: pd.DataFrame) -> pd.Series: + """Return parsed dates from an OHLCV frame, whether Date is a column or the index.""" + if "Date" in data.columns: + return pd.to_datetime(data["Date"], errors="coerce").dropna() + # yfinance keeps the dates in the index (a DatetimeIndex, sometimes unnamed). + if isinstance(data.index, pd.DatetimeIndex): + return pd.Series(pd.to_datetime(data.index, errors="coerce")).dropna() + # Fallback: expose the index and look for any date-like column. + df = data.reset_index() + for col in ("Date", "Datetime", "date", "index"): + if col in df.columns: + parsed = pd.to_datetime(df[col], errors="coerce").dropna() + if not parsed.empty: + return parsed + return pd.Series(dtype="datetime64[ns]") + + +def _assert_ohlcv_not_stale( + data: pd.DataFrame, + curr_date: str, + symbol: str, + canonical: str | None = None, + *, + max_stale_days: int = MAX_OHLCV_STALE_DAYS, +) -> None: + """Reject OHLCV whose latest row is far older than curr_date. + + Raises NoMarketDataError (with a stale-specific detail) so the router treats + it like any other "no usable data from this vendor" — try the next vendor, + then emit one clear unavailable signal. Empty frames are left to the + caller's existing no-data handling; this guards only the dangerous case of + present-but-stale rows (a vendor returning a year-old frame that would + otherwise feed wrong prices to the agent, #1021). + """ + if data is None or data.empty: + return + requested = pd.to_datetime(curr_date, errors="coerce") + if pd.isna(requested): + return + requested = requested.normalize() + dates = _coerce_ohlcv_dates(data) + if dates.empty: + return + latest = dates.max().normalize() + stale_days = (requested - latest).days + if stale_days > max_stale_days: + raise NoMarketDataError( + symbol, + canonical, + f"latest row is {latest.date()}, {stale_days} days before the " + f"requested {requested.date()} (stale) — refusing to use it", + ) + + def load_ohlcv(symbol: str, curr_date: str) -> pd.DataFrame: """Fetch OHLCV data with caching, filtered to prevent look-ahead bias. @@ -125,6 +185,10 @@ def load_ohlcv(symbol: str, curr_date: str) -> pd.DataFrame: # Filter to curr_date to prevent look-ahead bias in backtesting data = data[data["Date"] <= curr_date_dt] + # Reject a stale frame (latest row far older than curr_date) rather than + # feeding year-old prices into indicators (#1021). + _assert_ohlcv_not_stale(data, curr_date, symbol, canonical) + return data diff --git a/tradingagents/dataflows/y_finance.py b/tradingagents/dataflows/y_finance.py index 5e0cea865..fdb49d52d 100644 --- a/tradingagents/dataflows/y_finance.py +++ b/tradingagents/dataflows/y_finance.py @@ -1,11 +1,19 @@ -from typing import Annotated from datetime import datetime -from dateutil.relativedelta import relativedelta +from typing import Annotated + import pandas as pd import yfinance as yf -import os -from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry, load_ohlcv, filter_financials_by_date -from .symbol_utils import normalize_symbol, NoMarketDataError +from dateutil.relativedelta import relativedelta + +from .stockstats_utils import ( + StockstatsUtils, + _assert_ohlcv_not_stale, + filter_financials_by_date, + load_ohlcv, + yf_retry, +) +from .symbol_utils import NoMarketDataError, normalize_symbol + def get_YFin_data_online( symbol: Annotated[str, "ticker symbol of the company"], @@ -38,6 +46,11 @@ def get_YFin_data_online( if data.index.tz is not None: data.index = data.index.tz_localize(None) + # Reject a stale frame (e.g. a year-old partial response) before it is + # formatted into the report. Raises NoMarketDataError, which the router + # turns into one clear unavailable signal (#1021). + _assert_ohlcv_not_stale(data, end_date, symbol, canonical) + # Round numerical values to 2 decimal places for cleaner display numeric_columns = ["Open", "High", "Low", "Close", "Adj Close"] for col in numeric_columns: @@ -150,23 +163,23 @@ def get_stock_stats_indicators_window( # Optimized: Get stock data once and calculate indicators for all dates try: indicator_data = _get_stock_stats_bulk(symbol, indicator, curr_date) - + # Generate the date range we need current_dt = curr_date_dt date_values = [] - + while current_dt >= before: date_str = current_dt.strftime('%Y-%m-%d') - + # Look up the indicator value for this date if date_str in indicator_data: indicator_value = indicator_data[date_str] else: indicator_value = "N/A: Not a trading day (weekend or holiday)" - + date_values.append((date_str, indicator_value)) current_dt = current_dt - relativedelta(days=1) - + # Build the result string ind_string = "" for date_str, value in date_values: @@ -211,22 +224,22 @@ def _get_stock_stats_bulk( data = load_ohlcv(symbol, curr_date) df = wrap(data) df["Date"] = df["Date"].dt.strftime("%Y-%m-%d") - + # Calculate the indicator for all rows at once df[indicator] # This triggers stockstats to calculate the indicator - + # Create a dictionary mapping date strings to indicator values result_dict = {} for _, row in df.iterrows(): date_str = row["Date"] indicator_value = row[indicator] - + # Handle NaN/None values if pd.isna(indicator_value): result_dict[date_str] = "N/A" else: result_dict[date_str] = str(indicator_value) - + return result_dict @@ -450,8 +463,8 @@ def get_insider_transactions( # Add header information header = f"# Insider Transactions data for {canonical}\n" header += f"# Data retrieved on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" - + return header + csv_string - + except Exception as e: - return f"Error retrieving insider transactions for {ticker}: {str(e)}" \ No newline at end of file + return f"Error retrieving insider transactions for {ticker}: {str(e)}"