mirror of
https://github.com/TauricResearch/TradingAgents.git
synced 2026-06-16 21:06:15 +03:00
fix(data): reject stale yfinance OHLCV instead of reporting wrong prices
yfinance intermittently returns a year-old partial frame (e.g. June 2025 rows for a June 2026 request) that still has rows and a Close, so it passed the empty-check and silently fed a wrong close price and indicators into the report (#1021). Add a freshness guard that rejects a frame whose latest row is far older than the requested date, on both the raw OHLCV path and the indicator path. It raises the existing NoMarketDataError with a stale-specific detail, so the vendor router's try-next-vendor and single unavailable-signal handling apply unchanged; the sentinel now surfaces that detail so the agent reports the specific reason rather than fabricating a value.
This commit is contained in:
113
tests/test_yfinance_stale_ohlcv_guard.py
Normal file
113
tests/test_yfinance_stale_ohlcv_guard.py
Normal file
@@ -0,0 +1,113 @@
|
||||
"""Stale OHLCV guard (#1021): a vendor returning a year-old partial frame must
|
||||
be rejected, not fed into the report as if it were current.
|
||||
|
||||
The guard raises NoMarketDataError with a stale-specific detail, so the router's
|
||||
existing try-next-vendor + single-sentinel handling applies and the sentinel
|
||||
surfaces the reason.
|
||||
"""
|
||||
import copy
|
||||
import unittest
|
||||
from unittest import mock
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
import tradingagents.dataflows.config as config_module
|
||||
import tradingagents.dataflows.y_finance as y_finance
|
||||
import tradingagents.default_config as default_config
|
||||
from tradingagents.dataflows import interface
|
||||
from tradingagents.dataflows.config import set_config
|
||||
from tradingagents.dataflows.stockstats_utils import _assert_ohlcv_not_stale
|
||||
from tradingagents.dataflows.symbol_utils import NoMarketDataError
|
||||
|
||||
|
||||
def _frame(date):
    """Build a single-row OHLCV frame whose only ``Date`` value is *date*.

    The price and volume figures are fixed placeholders; only the date varies
    between tests.
    """
    row_date = pd.Timestamp(date)
    prices = {
        "Open": [330.0],
        "High": [332.0],
        "Low": [328.0],
        "Close": [330.58],
    }
    # Keep the column order Date, Open, High, Low, Close, Volume.
    return pd.DataFrame({"Date": [row_date], **prices, "Volume": [1_000_000]})
|
||||
|
||||
|
||||
@pytest.mark.unit
class StaleGuardUnitTests(unittest.TestCase):
    """Direct checks of ``_assert_ohlcv_not_stale`` against hand-built frames."""

    def test_recent_prior_trading_day_is_accepted(self):
        # Latest row is one day before the requested date, so the guard must
        # return without raising.
        previous_day = _frame("2026-06-10")
        _assert_ohlcv_not_stale(previous_day, "2026-06-11", "CB")

    def test_year_old_row_is_rejected_with_detail(self):
        year_old = _frame("2025-06-11")
        with self.assertRaises(NoMarketDataError) as caught:
            _assert_ohlcv_not_stale(year_old, "2026-06-11", "CB", "CB")
        # The message must name both dates and flag the frame as stale.
        detail = str(caught.exception)
        for fragment in ("2025-06-11", "2026-06-11", "stale"):
            self.assertIn(fragment, detail)

    def test_empty_frame_is_left_to_caller(self):
        # An empty frame is a no-data condition handled elsewhere; the
        # staleness guard must stay silent on it.
        no_rows = pd.DataFrame(columns=["Date", "Close"])
        _assert_ohlcv_not_stale(no_rows, "2026-06-11", "X")

    def test_long_holiday_gap_within_threshold_is_accepted(self):
        # A 9-day gap between the latest row and the requested date.
        nine_days_back = _frame("2026-06-02")
        _assert_ohlcv_not_stale(nine_days_back, "2026-06-11", "X")
|
||||
|
||||
|
||||
@pytest.mark.unit
class StaleGuardPropagationTests(unittest.TestCase):
    """The guard's error must escape ``get_YFin_data_online`` unchanged."""

    def test_get_yfin_data_online_raises_on_stale_frame(self):
        # A one-row frame dated a year before the requested window, with the
        # dates held in a DatetimeIndex named "Date".
        year_old_index = pd.DatetimeIndex(
            [pd.Timestamp("2025-06-11")], name="Date"
        )
        year_old_frame = pd.DataFrame(
            {
                "Open": [280.0],
                "High": [286.0],
                "Low": [278.0],
                "Close": [284.45],
                "Volume": [1_000_000],
            },
            index=year_old_index,
        )

        class DummyTicker:
            """Stand-in for ``yf.Ticker`` that always returns the stale frame."""

            def __init__(self, symbol):
                pass

            def history(self, start, end):
                return year_old_frame

        ticker_patch = mock.patch.object(y_finance.yf, "Ticker", DummyTicker)
        with ticker_patch:
            with self.assertRaises(NoMarketDataError):
                y_finance.get_YFin_data_online("CB", "2026-06-01", "2026-06-11")
|
||||
|
||||
|
||||
@pytest.mark.unit
class StaleGuardRoutingTests(unittest.TestCase):
    """The router's no-data sentinel must carry the stale-specific detail."""

    def setUp(self):
        # Start every test from a pristine copy of the default config.
        pristine = copy.deepcopy(default_config.DEFAULT_CONFIG)
        config_module._config = pristine

    def tearDown(self):
        # Undo any set_config() call so later tests see the defaults again.
        pristine = copy.deepcopy(default_config.DEFAULT_CONFIG)
        config_module._config = pristine

    def test_router_sentinel_surfaces_stale_reason(self):
        set_config({"data_vendors": {"core_stock_apis": "yfinance"}})

        def _always_stale(symbol, *args, **kwargs):
            raise NoMarketDataError(
                symbol, symbol, "latest row is 2025-06-11, 365 days before ... (stale)"
            )

        vendor_override = {"get_stock_data": {"yfinance": _always_stale}}
        with mock.patch.dict(
            interface.VENDOR_METHODS, vendor_override, clear=False
        ):
            result = interface.route_to_vendor(
                "get_stock_data", "CB", "2026-06-01", "2026-06-11"
            )

        # Both the sentinel and the typed detail must reach the agent.
        for marker in ("NO_DATA_AVAILABLE", "stale"):
            self.assertIn(marker, result)
|
||||
|
||||
|
||||
# Allow running this test module directly as a script through unittest's
# own runner.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -1,18 +1,24 @@
|
||||
import time
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import Annotated
|
||||
|
||||
import pandas as pd
|
||||
import yfinance as yf
|
||||
from yfinance.exceptions import YFRateLimitError
|
||||
from stockstats import wrap
|
||||
from typing import Annotated
|
||||
import os
|
||||
from yfinance.exceptions import YFRateLimitError
|
||||
|
||||
from .config import get_config
|
||||
from .symbol_utils import NoMarketDataError, normalize_symbol
|
||||
from .utils import safe_ticker_component
|
||||
from .symbol_utils import normalize_symbol, NoMarketDataError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Freshness threshold in calendar days. A vendor's latest OHLCV row that is
# MORE than this many days before the requested date is treated as stale (the
# guard compares with a strict ``>``, so a gap of exactly this size is still
# accepted). Generous enough to span long holiday weekends, tight enough to
# catch the year-old frames yfinance occasionally returns (#1021).
MAX_OHLCV_STALE_DAYS = 10
|
||||
|
||||
|
||||
def yf_retry(func, max_retries=3, base_delay=2.0):
|
||||
"""Execute a yfinance call with exponential backoff on rate limits.
|
||||
@@ -62,6 +68,60 @@ def _clean_dataframe(data: pd.DataFrame) -> pd.DataFrame:
|
||||
return data
|
||||
|
||||
|
||||
def _coerce_ohlcv_dates(data: pd.DataFrame) -> pd.Series:
    """Return the parsed, timezone-naive dates of an OHLCV frame.

    The dates are looked up, in order, in a ``Date`` column, in a
    ``DatetimeIndex``, and finally in any date-like column exposed by
    ``reset_index()``. Unparseable values are dropped.

    Args:
        data: OHLCV frame with dates in a column or in the index.

    Returns:
        A Series of timezone-naive timestamps; empty when no dates are found.
    """

    def _parse(values) -> pd.Series:
        parsed = pd.Series(pd.to_datetime(values, errors="coerce")).dropna()
        # Drop tz info (keeping the local wall-clock date) so callers can
        # subtract these from a naive requested date without a TypeError.
        if isinstance(parsed.dtype, pd.DatetimeTZDtype):
            parsed = parsed.dt.tz_localize(None)
        return parsed

    if "Date" in data.columns:
        return _parse(data["Date"])
    # yfinance keeps the dates in the index (a DatetimeIndex, sometimes unnamed).
    if isinstance(data.index, pd.DatetimeIndex):
        return _parse(data.index)
    # Fallback: expose the index and look for any date-like column.
    exposed = data.reset_index()
    for col in ("Date", "Datetime", "date", "index"):
        if col not in exposed.columns:
            continue
        candidate = exposed[col]
        # A numeric column (e.g. a default RangeIndex exposed as "index")
        # would be read as epoch nanoseconds and yield bogus 1970 dates.
        if pd.api.types.is_numeric_dtype(candidate):
            continue
        parsed = _parse(candidate)
        if not parsed.empty:
            return parsed
    return pd.Series(dtype="datetime64[ns]")
|
||||
|
||||
|
||||
def _assert_ohlcv_not_stale(
    data: pd.DataFrame,
    curr_date: str,
    symbol: str,
    canonical: str | None = None,
    *,
    max_stale_days: int = MAX_OHLCV_STALE_DAYS,
) -> None:
    """Reject OHLCV whose latest row is far older than curr_date.

    Raises NoMarketDataError (with a stale-specific detail) so the router treats
    it like any other "no usable data from this vendor" — try the next vendor,
    then emit one clear unavailable signal. Empty frames are left to the
    caller's existing no-data handling; this guards only the dangerous case of
    present-but-stale rows (a vendor returning a year-old frame that would
    otherwise feed wrong prices to the agent, #1021).

    Args:
        data: OHLCV frame to check; ``None`` or empty is accepted silently.
        curr_date: Requested date; an unparseable value skips the check.
        symbol: Symbol passed through to the error.
        canonical: Canonical symbol passed through to the error.
        max_stale_days: Largest accepted gap, in calendar days, between the
            latest row and ``curr_date``. Only a strictly larger gap raises.

    Raises:
        NoMarketDataError: The latest row is more than ``max_stale_days``
            days before ``curr_date``.
    """
    if data is None or data.empty:
        return
    requested = pd.to_datetime(curr_date, errors="coerce")
    if pd.isna(requested):
        return
    # Compare calendar dates only. Strip tz info from both sides first:
    # subtracting a tz-aware from a tz-naive timestamp raises TypeError, which
    # would bypass the router's NoMarketDataError handling.
    if requested.tzinfo is not None:
        requested = requested.tz_localize(None)
    requested = requested.normalize()
    dates = _coerce_ohlcv_dates(data)
    if dates.empty:
        return
    latest = dates.max()
    if latest.tzinfo is not None:
        latest = latest.tz_localize(None)
    latest = latest.normalize()
    stale_days = (requested - latest).days
    if stale_days > max_stale_days:
        raise NoMarketDataError(
            symbol,
            canonical,
            f"latest row is {latest.date()}, {stale_days} days before the "
            f"requested {requested.date()} (stale) — refusing to use it",
        )
|
||||
|
||||
|
||||
def load_ohlcv(symbol: str, curr_date: str) -> pd.DataFrame:
|
||||
"""Fetch OHLCV data with caching, filtered to prevent look-ahead bias.
|
||||
|
||||
@@ -125,6 +185,10 @@ def load_ohlcv(symbol: str, curr_date: str) -> pd.DataFrame:
|
||||
# Filter to curr_date to prevent look-ahead bias in backtesting
|
||||
data = data[data["Date"] <= curr_date_dt]
|
||||
|
||||
# Reject a stale frame (latest row far older than curr_date) rather than
|
||||
# feeding year-old prices into indicators (#1021).
|
||||
_assert_ohlcv_not_stale(data, curr_date, symbol, canonical)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
|
||||
@@ -1,11 +1,19 @@
|
||||
from typing import Annotated
|
||||
from datetime import datetime
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from typing import Annotated
|
||||
|
||||
import pandas as pd
|
||||
import yfinance as yf
|
||||
import os
|
||||
from .stockstats_utils import StockstatsUtils, _clean_dataframe, yf_retry, load_ohlcv, filter_financials_by_date
|
||||
from .symbol_utils import normalize_symbol, NoMarketDataError
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
from .stockstats_utils import (
|
||||
StockstatsUtils,
|
||||
_assert_ohlcv_not_stale,
|
||||
filter_financials_by_date,
|
||||
load_ohlcv,
|
||||
yf_retry,
|
||||
)
|
||||
from .symbol_utils import NoMarketDataError, normalize_symbol
|
||||
|
||||
|
||||
def get_YFin_data_online(
|
||||
symbol: Annotated[str, "ticker symbol of the company"],
|
||||
@@ -38,6 +46,11 @@ def get_YFin_data_online(
|
||||
if data.index.tz is not None:
|
||||
data.index = data.index.tz_localize(None)
|
||||
|
||||
# Reject a stale frame (e.g. a year-old partial response) before it is
|
||||
# formatted into the report. Raises NoMarketDataError, which the router
|
||||
# turns into one clear unavailable signal (#1021).
|
||||
_assert_ohlcv_not_stale(data, end_date, symbol, canonical)
|
||||
|
||||
# Round numerical values to 2 decimal places for cleaner display
|
||||
numeric_columns = ["Open", "High", "Low", "Close", "Adj Close"]
|
||||
for col in numeric_columns:
|
||||
|
||||
Reference in New Issue
Block a user