fix(data): reject stale yfinance OHLCV instead of reporting wrong prices

yfinance intermittently returns a year-old partial frame (e.g. June 2025 rows
for a June 2026 request) that still has rows and a Close, so it passed the
empty-check and silently fed a wrong close price and indicators into the report
(#1021). Add a freshness guard that rejects a frame whose latest row is far
older than the requested date, on both the raw OHLCV path and the indicator
path. It raises the existing NoMarketDataError with a stale-specific detail, so
the vendor router's try-next-vendor and single unavailable-signal handling apply
unchanged; the sentinel now surfaces that detail so the agent reports the
specific reason rather than fabricating a value.
This commit is contained in:
Yijia-Xiao
2026-06-14 07:10:15 +00:00
parent 7df18fc912
commit 9fd54f8368
3 changed files with 212 additions and 22 deletions

View File

@@ -0,0 +1,113 @@
"""Stale OHLCV guard (#1021): a vendor returning a year-old partial frame must
be rejected, not fed into the report as if it were current.
The guard raises NoMarketDataError with a stale-specific detail, so the router's
existing try-next-vendor + single-sentinel handling applies and the sentinel
surfaces the reason.
"""
import copy
import unittest
from unittest import mock
import pandas as pd
import pytest
import tradingagents.dataflows.config as config_module
import tradingagents.dataflows.y_finance as y_finance
import tradingagents.default_config as default_config
from tradingagents.dataflows import interface
from tradingagents.dataflows.config import set_config
from tradingagents.dataflows.stockstats_utils import _assert_ohlcv_not_stale
from tradingagents.dataflows.symbol_utils import NoMarketDataError
def _frame(date):
return pd.DataFrame(
{
"Date": [pd.Timestamp(date)],
"Open": [330.0],
"High": [332.0],
"Low": [328.0],
"Close": [330.58],
"Volume": [1_000_000],
}
)
@pytest.mark.unit
class StaleGuardUnitTests(unittest.TestCase):
    """Direct calls to ``_assert_ohlcv_not_stale`` on small hand-built frames."""

    def test_recent_prior_trading_day_is_accepted(self):
        # Latest row is one day before the requested date: must not raise.
        previous_day = _frame("2026-06-10")
        _assert_ohlcv_not_stale(previous_day, "2026-06-11", "CB")

    def test_year_old_row_is_rejected_with_detail(self):
        with self.assertRaises(NoMarketDataError) as caught:
            _assert_ohlcv_not_stale(_frame("2025-06-11"), "2026-06-11", "CB", "CB")

        # The message must name both dates and flag the frame as stale.
        message = str(caught.exception)
        for fragment in ("2025-06-11", "2026-06-11", "stale"):
            self.assertIn(fragment, message)

    def test_empty_frame_is_left_to_caller(self):
        # An empty frame is a no-data condition, not a staleness one, so the
        # guard is expected to return without raising.
        no_rows = pd.DataFrame(columns=["Date", "Close"])
        _assert_ohlcv_not_stale(no_rows, "2026-06-11", "X")

    def test_long_holiday_gap_within_threshold_is_accepted(self):
        # 2026-06-02 -> 2026-06-11 is a 9-day gap.
        nine_days_back = _frame("2026-06-02")
        _assert_ohlcv_not_stale(nine_days_back, "2026-06-11", "X")
@pytest.mark.unit
class StaleGuardPropagationTests(unittest.TestCase):
    """Checks that the raw OHLCV entry point raises when given a stale frame."""

    def test_get_yfin_data_online_raises_on_stale_frame(self):
        # A single row dated one year before the requested end date, with the
        # dates held in a named DatetimeIndex rather than a column.
        year_old_index = pd.DatetimeIndex([pd.Timestamp("2025-06-11")], name="Date")
        year_old_frame = pd.DataFrame(
            {
                "Open": [280.0],
                "High": [286.0],
                "Low": [278.0],
                "Close": [284.45],
                "Volume": [1_000_000],
            },
            index=year_old_index,
        )

        class _StubTicker:
            """Replacement for ``yf.Ticker`` that always returns the old frame."""

            def __init__(self, symbol):
                pass

            def history(self, start, end):
                return year_old_frame

        with mock.patch.object(y_finance.yf, "Ticker", _StubTicker):
            with self.assertRaises(NoMarketDataError):
                y_finance.get_YFin_data_online("CB", "2026-06-01", "2026-06-11")
@pytest.mark.unit
class StaleGuardRoutingTests(unittest.TestCase):
    """Checks what the vendor router returns when a vendor reports stale data."""

    @staticmethod
    def _restore_default_config():
        # Fresh deep copy so a test's set_config() call cannot leak into others.
        config_module._config = copy.deepcopy(default_config.DEFAULT_CONFIG)

    def setUp(self):
        self._restore_default_config()

    def tearDown(self):
        self._restore_default_config()

    def test_router_sentinel_surfaces_stale_reason(self):
        set_config({"data_vendors": {"core_stock_apis": "yfinance"}})

        def _raise_stale(symbol, *args, **kwargs):
            detail = "latest row is 2025-06-11, 365 days before ... (stale)"
            raise NoMarketDataError(symbol, symbol, detail)

        stubbed_vendors = {"get_stock_data": {"yfinance": _raise_stale}}
        with mock.patch.dict(interface.VENDOR_METHODS, stubbed_vendors, clear=False):
            routed = interface.route_to_vendor(
                "get_stock_data", "CB", "2026-06-01", "2026-06-11"
            )

        # The router output must carry both the no-data sentinel and the
        # stale-specific detail taken from the raised error.
        for expected in ("NO_DATA_AVAILABLE", "stale"):
            self.assertIn(expected, routed)
# Allow running this test module directly as a script via unittest's runner.
if __name__ == "__main__":
    unittest.main()

View File

@@ -1,18 +1,24 @@
import time
import logging
import os
import time
from typing import Annotated
import pandas as pd
import yfinance as yf
from yfinance.exceptions import YFRateLimitError
from stockstats import wrap
from typing import Annotated
import os
from yfinance.exceptions import YFRateLimitError
from .config import get_config
from .symbol_utils import NoMarketDataError, normalize_symbol
from .utils import safe_ticker_component
from .symbol_utils import normalize_symbol, NoMarketDataError
logger = logging.getLogger(__name__)
# Maximum age, in calendar days, of a vendor's latest OHLCV row relative to the
# requested date. It is the default for ``max_stale_days`` in
# ``_assert_ohlcv_not_stale``, which rejects a frame only when the gap is
# strictly greater than this value. Generous enough to span long holiday
# weekends, tight enough to catch the year-old frames yfinance occasionally
# returns (#1021).
MAX_OHLCV_STALE_DAYS = 10
def yf_retry(func, max_retries=3, base_delay=2.0):
"""Execute a yfinance call with exponential backoff on rate limits.
@@ -62,6 +68,60 @@ def _clean_dataframe(data: pd.DataFrame) -> pd.DataFrame:
return data
def _coerce_ohlcv_dates(data: pd.DataFrame) -> pd.Series:
"""Return parsed dates from an OHLCV frame, whether Date is a column or the index."""
if "Date" in data.columns:
return pd.to_datetime(data["Date"], errors="coerce").dropna()
# yfinance keeps the dates in the index (a DatetimeIndex, sometimes unnamed).
if isinstance(data.index, pd.DatetimeIndex):
return pd.Series(pd.to_datetime(data.index, errors="coerce")).dropna()
# Fallback: expose the index and look for any date-like column.
df = data.reset_index()
for col in ("Date", "Datetime", "date", "index"):
if col in df.columns:
parsed = pd.to_datetime(df[col], errors="coerce").dropna()
if not parsed.empty:
return parsed
return pd.Series(dtype="datetime64[ns]")
def _assert_ohlcv_not_stale(
    data: pd.DataFrame,
    curr_date: str,
    symbol: str,
    canonical: str | None = None,
    *,
    max_stale_days: int = MAX_OHLCV_STALE_DAYS,
) -> None:
    """Raise if the newest OHLCV row is much older than the requested date.

    This guards the present-but-stale case (#1021): a vendor returning a
    year-old frame that would otherwise feed wrong prices to the agent. The
    check is skipped, returning ``None``, when there is nothing to compare:
    ``data`` is ``None`` or empty, ``curr_date`` cannot be parsed, or no dates
    can be extracted from the frame. Empty frames are left to the caller's
    existing no-data handling.

    Args:
        data: OHLCV frame to validate.
        curr_date: The requested date; parsed with ``pd.to_datetime``.
        symbol: Symbol passed through to the raised error.
        canonical: Canonical symbol passed through to the raised error.
        max_stale_days: Largest allowed gap, in whole days, between the newest
            row and ``curr_date``.

    Raises:
        NoMarketDataError: With a stale-specific detail, when the gap exceeds
            ``max_stale_days``. Using this error type lets the router treat a
            stale frame like any other "no usable data from this vendor" case.
    """
    if data is None or data.empty:
        return

    target_day = pd.to_datetime(curr_date, errors="coerce")
    if pd.isna(target_day):
        return
    target_day = target_day.normalize()

    row_dates = _coerce_ohlcv_dates(data)
    if row_dates.empty:
        return

    # Compare at day granularity so intraday timestamps do not affect the gap.
    newest_day = row_dates.max().normalize()
    age_in_days = (target_day - newest_day).days
    if age_in_days > max_stale_days:
        detail = (
            f"latest row is {newest_day.date()}, {age_in_days} days before the "
            f"requested {target_day.date()} (stale) — refusing to use it"
        )
        raise NoMarketDataError(symbol, canonical, detail)
def load_ohlcv(symbol: str, curr_date: str) -> pd.DataFrame:
"""Fetch OHLCV data with caching, filtered to prevent look-ahead bias.
@@ -125,6 +185,10 @@ def load_ohlcv(symbol: str, curr_date: str) -> pd.DataFrame:
# Filter to curr_date to prevent look-ahead bias in backtesting
data = data[data["Date"] <= curr_date_dt]
# Reject a stale frame (latest row far older than curr_date) rather than
# feeding year-old prices into indicators (#1021).
_assert_ohlcv_not_stale(data, curr_date, symbol, canonical)
return data

View File

@@ -1,11 +1,19 @@
from typing import Annotated
from datetime import datetime
from dateutil.relativedelta import relativedelta
from typing import Annotated
import pandas as pd
import yfinance as yf
import os
from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry, load_ohlcv, filter_financials_by_date
from .symbol_utils import normalize_symbol, NoMarketDataError
from dateutil.relativedelta import relativedelta
from .stockstats_utils import (
StockstatsUtils,
_assert_ohlcv_not_stale,
filter_financials_by_date,
load_ohlcv,
yf_retry,
)
from .symbol_utils import NoMarketDataError, normalize_symbol
def get_YFin_data_online(
symbol: Annotated[str, "ticker symbol of the company"],
@@ -38,6 +46,11 @@ def get_YFin_data_online(
if data.index.tz is not None:
data.index = data.index.tz_localize(None)
# Reject a stale frame (e.g. a year-old partial response) before it is
# formatted into the report. Raises NoMarketDataError, which the router
# turns into one clear unavailable signal (#1021).
_assert_ohlcv_not_stale(data, end_date, symbol, canonical)
# Round numerical values to 2 decimal places for cleaner display
numeric_columns = ["Open", "High", "Low", "Close", "Adj Close"]
for col in numeric_columns:
@@ -150,23 +163,23 @@ def get_stock_stats_indicators_window(
# Optimized: Get stock data once and calculate indicators for all dates
try:
indicator_data = _get_stock_stats_bulk(symbol, indicator, curr_date)
# Generate the date range we need
current_dt = curr_date_dt
date_values = []
while current_dt >= before:
date_str = current_dt.strftime('%Y-%m-%d')
# Look up the indicator value for this date
if date_str in indicator_data:
indicator_value = indicator_data[date_str]
else:
indicator_value = "N/A: Not a trading day (weekend or holiday)"
date_values.append((date_str, indicator_value))
current_dt = current_dt - relativedelta(days=1)
# Build the result string
ind_string = ""
for date_str, value in date_values:
@@ -211,22 +224,22 @@ def _get_stock_stats_bulk(
data = load_ohlcv(symbol, curr_date)
df = wrap(data)
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
# Calculate the indicator for all rows at once
df[indicator] # This triggers stockstats to calculate the indicator
# Create a dictionary mapping date strings to indicator values
result_dict = {}
for _, row in df.iterrows():
date_str = row["Date"]
indicator_value = row[indicator]
# Handle NaN/None values
if pd.isna(indicator_value):
result_dict[date_str] = "N/A"
else:
result_dict[date_str] = str(indicator_value)
return result_dict
@@ -450,8 +463,8 @@ def get_insider_transactions(
# Add header information
header = f"# Insider Transactions data for {canonical}\n"
header += f"# Data retrieved on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
return header + csv_string
except Exception as e:
return f"Error retrieving insider transactions for {ticker}: {str(e)}"
return f"Error retrieving insider transactions for {ticker}: {str(e)}"