From 0c1231a4057c4348d5f1fa1f45c1002921b0d035 Mon Sep 17 00:00:00 2001 From: Yijia-Xiao Date: Sat, 13 Jun 2026 21:54:07 +0000 Subject: [PATCH] fix(data): keep future/undated news out of historical windows The yfinance news date filter only ran when an article had a parsed date, so flat-format and undated articles bypassed it and leaked future news into historical/backtest runs. Parse the flat providerPublishTime, apply one look-ahead-safe window rule across ticker and global news (undated kept only when the window reaches the present), and return an informative message when everything is filtered out. --- tests/test_news_lookahead.py | 80 ++++++++++++++++++++++++ tradingagents/dataflows/yfinance_news.py | 75 +++++++++++++--------- 2 files changed, 125 insertions(+), 30 deletions(-) create mode 100644 tests/test_news_lookahead.py diff --git a/tests/test_news_lookahead.py b/tests/test_news_lookahead.py new file mode 100644 index 000000000..f58d54c62 --- /dev/null +++ b/tests/test_news_lookahead.py @@ -0,0 +1,80 @@ +"""yfinance news must not leak future-dated (or undated, in a backtest) articles +into a historical window. + +Regressions for #992 (flat articles bypassed the date filter), #1007 (global +news injected future articles), #993 (empty-after-filter returned a blank body). +""" +import time +from datetime import datetime +from unittest import mock + +import pytest + +import tradingagents.dataflows.yfinance_news as ynews + + +def _epoch(date_str): + return int(time.mktime(datetime.strptime(date_str, "%Y-%m-%d").timetuple())) + + +@pytest.mark.unit +def test_flat_article_publish_time_is_parsed(): + # #992: flat articles now carry a pub_date (was always None -> unfilterable). + data = ynews._extract_article_data( + {"title": "X", "publisher": "P", "link": "l", "providerPublishTime": _epoch("2025-05-09")} + ) + assert data["pub_date"] is not None + assert data["pub_date"].strftime("%Y-%m-%d") == "2025-05-09" + + +@pytest.mark.unit +def test_window_excludes_future_and_undated_in_backtest(): + start = datetime(2025, 5, 1) + end = datetime(2025, 5, 9) # historical window (well in the past) + inside = datetime(2025, 5, 5) + future = datetime(2025, 6, 1) + assert ynews._in_news_window(inside, start, end) is True + assert ynews._in_news_window(future, start, end) is False # look-ahead blocked + assert ynews._in_news_window(None, start, end) is False # undated -> excluded in backtest + + +@pytest.mark.unit +def test_window_keeps_undated_in_live_window(): + # Live window (reaches today): undated articles can't be "future", so keep them. + start = datetime.now() + end = datetime.now() + assert ynews._in_news_window(None, start, end) is True + + +@pytest.mark.unit +def test_global_news_future_flat_article_excluded(monkeypatch): + # #1007: a flat, future-dated global article must not appear in a historical run. + future_article = {"title": "FUTURE EVENT", "publisher": "P", "link": "l", + "providerPublishTime": _epoch("2025-06-01")} + past_article = {"title": "PAST EVENT", "publisher": "P", "link": "l", + "providerPublishTime": _epoch("2025-05-05")} + + class FakeSearch: + def __init__(self, *a, **k): + self.news = [future_article, past_article] + + monkeypatch.setattr(ynews.yf, "Search", FakeSearch) + out = ynews.get_global_news_yfinance("2025-05-09", look_back_days=7, limit=10) + assert "PAST EVENT" in out + assert "FUTURE EVENT" not in out # #1007 + + +@pytest.mark.unit +def test_global_news_empty_after_filter_is_informative(monkeypatch): + # #993: everything filtered out -> a clear message, not a blank-bodied report. + only_future = {"title": "FUTURE", "publisher": "P", "link": "l", + "providerPublishTime": _epoch("2025-06-01")} + + class FakeSearch: + def __init__(self, *a, **k): + self.news = [only_future] + + monkeypatch.setattr(ynews.yf, "Search", FakeSearch) + out = ynews.get_global_news_yfinance("2025-05-09", look_back_days=7, limit=10) + assert "No global news found" in out + assert "###" not in out # no empty article body diff --git a/tradingagents/dataflows/yfinance_news.py b/tradingagents/dataflows/yfinance_news.py index 55c5d2512..fa0c342be 100644 --- a/tradingagents/dataflows/yfinance_news.py +++ b/tradingagents/dataflows/yfinance_news.py @@ -41,16 +41,39 @@ def _extract_article_data(article: dict) -> dict: "pub_date": pub_date, } else: - # Fallback for flat structure + # Fallback for flat structure. Parse the epoch publish time so flat + # articles are date-filterable too (otherwise they bypass the + # historical window and leak future news, #992/#1007). + pub_date = None + ts = article.get("providerPublishTime") + if ts: + try: + pub_date = datetime.fromtimestamp(ts) + except (ValueError, OSError, TypeError): + pass return { "title": article.get("title", "No title"), "summary": article.get("summary", ""), "publisher": article.get("publisher", "Unknown"), "link": article.get("link", ""), - "pub_date": None, + "pub_date": pub_date, } +def _in_news_window(pub_date, start_dt, end_dt) -> bool: + """Whether an article belongs in the [start_dt, end_dt] window. + + Dated articles are kept only if they fall in the window. An undated article + is kept only when the window reaches the present (live run) — in a + historical/backtest window it's excluded, since we can't prove it isn't + future news (look-ahead safety, #992/#1007). + """ + if pub_date is not None: + naive = pub_date.replace(tzinfo=None) if hasattr(pub_date, "replace") else pub_date + return start_dt <= naive <= end_dt + relativedelta(days=1) + return end_dt >= datetime.now() - relativedelta(days=1) + + def get_news_yfinance( ticker: str, start_date: str, @@ -85,11 +108,9 @@ def get_news_yfinance( for article in news: data = _extract_article_data(article) - # Filter by date if publish time is available - if data["pub_date"]: - pub_date_naive = data["pub_date"].replace(tzinfo=None) - if not (start_dt <= pub_date_naive <= end_dt + relativedelta(days=1)): - continue + # Keep only articles within the requested window (look-ahead safe). + if not _in_news_window(data["pub_date"], start_dt, end_dt): + continue news_str += f"### {data['title']} (source: {data['publisher']})\n" if data["summary"]: @@ -170,31 +191,25 @@ def get_global_news_yfinance( start_date = start_dt.strftime("%Y-%m-%d") news_str = "" + kept = 0 for article in all_news[:limit]: - # Handle both flat and nested structures - if "content" in article: - data = _extract_article_data(article) - # Skip articles published after curr_date (look-ahead guard) - if data.get("pub_date"): - pub_naive = data["pub_date"].replace(tzinfo=None) if hasattr(data["pub_date"], "replace") else data["pub_date"] - if pub_naive > curr_dt + relativedelta(days=1): - continue - title = data["title"] - publisher = data["publisher"] - link = data["link"] - summary = data["summary"] - else: - title = article.get("title", "No title") - publisher = article.get("publisher", "Unknown") - link = article.get("link", "") - summary = "" - - news_str += f"### {title} (source: {publisher})\n" - if summary: - news_str += f"{summary}\n" - if link: - news_str += f"Link: {link}\n" + # Extract uniformly (flat + nested) and apply the same look-ahead-safe + # window filter, so flat articles can't leak future news (#1007). + data = _extract_article_data(article) + if not _in_news_window(data["pub_date"], start_dt, curr_dt): + continue + news_str += f"### {data['title']} (source: {data['publisher']})\n" + if data["summary"]: + news_str += f"{data['summary']}\n" + if data["link"]: + news_str += f"Link: {data['link']}\n" news_str += "\n" + kept += 1 + + # All candidates fell outside the window -> say so rather than return an + # empty-bodied report (#993). + if kept == 0: + return f"No global news found between {start_date} and {curr_date}" return f"## Global Market News, from {start_date} to {curr_date}:\n\n{news_str}"