fix(data): keep future/undated news out of historical windows

The yfinance news date filter only ran when an article had a parsed date, so
flat-format and undated articles bypassed it and leaked future news into
historical/backtest runs. Parse the flat providerPublishTime, apply one
look-ahead-safe window rule across ticker and global news (undated kept only
when the window reaches the present), and return an informative message when
everything is filtered out.
This commit is contained in:
Yijia-Xiao
2026-06-13 21:54:07 +00:00
parent e4be7cc5a3
commit 0c1231a405
2 changed files with 125 additions and 30 deletions

View File

@@ -0,0 +1,80 @@
"""yfinance news must not leak future-dated (or undated, in a backtest) articles
into a historical window.
Regressions for #992 (flat articles bypassed the date filter), #1007 (global
news injected future articles), #993 (empty-after-filter returned a blank body).
"""
import time
from datetime import datetime
from unittest import mock
import pytest
import tradingagents.dataflows.yfinance_news as ynews
def _epoch(date_str):
return int(time.mktime(datetime.strptime(date_str, "%Y-%m-%d").timetuple()))
@pytest.mark.unit
def test_flat_article_publish_time_is_parsed():
    """Flat-format articles expose a parsed ``pub_date`` (regression for #992)."""
    # A flat article carries its publish time as an epoch under
    # "providerPublishTime" rather than inside a nested "content" dict.
    flat_article = {
        "title": "X",
        "publisher": "P",
        "link": "l",
        "providerPublishTime": _epoch("2025-05-09"),
    }

    extracted = ynews._extract_article_data(flat_article)

    assert extracted["pub_date"] is not None
    assert extracted["pub_date"].strftime("%Y-%m-%d") == "2025-05-09"
@pytest.mark.unit
def test_window_excludes_future_and_undated_in_backtest():
    """A historical window keeps in-range articles and rejects future/undated ones."""
    # Fixed 2025 bounds: a window that lies well in the past (backtest case).
    window_start = datetime(2025, 5, 1)
    window_end = datetime(2025, 5, 9)

    within = datetime(2025, 5, 5)
    after = datetime(2025, 6, 1)

    # In-range article is kept.
    assert ynews._in_news_window(within, window_start, window_end) is True
    # Article dated after the window is blocked (look-ahead guard).
    assert ynews._in_news_window(after, window_start, window_end) is False
    # Undated article is excluded when the window is historical.
    assert ynews._in_news_window(None, window_start, window_end) is False
@pytest.mark.unit
def test_window_keeps_undated_in_live_window():
    """An undated article is kept when the window ends at the current time."""
    # Both bounds are "now": the window reaches the present, so an undated
    # article cannot be future news relative to it.
    window_start = datetime.now()
    window_end = datetime.now()

    keep_undated = ynews._in_news_window(None, window_start, window_end)

    assert keep_undated is True
@pytest.mark.unit
def test_global_news_future_flat_article_excluded(monkeypatch):
    """A flat, future-dated global article is dropped from a historical run (#1007)."""

    def _flat(title, day):
        # Build a flat-format article whose only date is the epoch publish time.
        return {
            "title": title,
            "publisher": "P",
            "link": "l",
            "providerPublishTime": _epoch(day),
        }

    canned = [_flat("FUTURE EVENT", "2025-06-01"), _flat("PAST EVENT", "2025-05-05")]

    class FakeSearch:
        """Stand-in for ``yf.Search`` that serves the canned articles."""

        def __init__(self, *a, **k):
            # Fresh list per instance, as in a real search result.
            self.news = list(canned)

    monkeypatch.setattr(ynews.yf, "Search", FakeSearch)

    report = ynews.get_global_news_yfinance("2025-05-09", look_back_days=7, limit=10)

    assert "PAST EVENT" in report
    assert "FUTURE EVENT" not in report  # #1007
@pytest.mark.unit
def test_global_news_empty_after_filter_is_informative(monkeypatch):
    """When every article is filtered out, a clear message is returned (#993)."""
    # The single candidate is dated after the requested end date.
    future_only = {
        "title": "FUTURE",
        "publisher": "P",
        "link": "l",
        "providerPublishTime": _epoch("2025-06-01"),
    }

    class FakeSearch:
        """Stand-in for ``yf.Search`` that serves only the future article."""

        def __init__(self, *a, **k):
            self.news = [future_only]

    monkeypatch.setattr(ynews.yf, "Search", FakeSearch)

    report = ynews.get_global_news_yfinance("2025-05-09", look_back_days=7, limit=10)

    assert "No global news found" in report
    # No "###" article heading means no empty article body was rendered.
    assert "###" not in report

View File

@@ -41,16 +41,39 @@ def _extract_article_data(article: dict) -> dict:
"pub_date": pub_date,
}
else:
# Fallback for flat structure
# Fallback for flat structure. Parse the epoch publish time so flat
# articles are date-filterable too (otherwise they bypass the
# historical window and leak future news, #992/#1007).
pub_date = None
ts = article.get("providerPublishTime")
if ts:
try:
pub_date = datetime.fromtimestamp(ts)
except (ValueError, OSError, TypeError):
pass
return {
"title": article.get("title", "No title"),
"summary": article.get("summary", ""),
"publisher": article.get("publisher", "Unknown"),
"link": article.get("link", ""),
"pub_date": None,
"pub_date": pub_date,
}
def _in_news_window(pub_date, start_dt, end_dt) -> bool:
"""Whether an article belongs in the [start_dt, end_dt] window.
Dated articles are kept only if they fall in the window. An undated article
is kept only when the window reaches the present (live run) — in a
historical/backtest window it's excluded, since we can't prove it isn't
future news (look-ahead safety, #992/#1007).
"""
if pub_date is not None:
naive = pub_date.replace(tzinfo=None) if hasattr(pub_date, "replace") else pub_date
return start_dt <= naive <= end_dt + relativedelta(days=1)
return end_dt >= datetime.now() - relativedelta(days=1)
def get_news_yfinance(
ticker: str,
start_date: str,
@@ -85,10 +108,8 @@ def get_news_yfinance(
for article in news:
data = _extract_article_data(article)
# Filter by date if publish time is available
if data["pub_date"]:
pub_date_naive = data["pub_date"].replace(tzinfo=None)
if not (start_dt <= pub_date_naive <= end_dt + relativedelta(days=1)):
# Keep only articles within the requested window (look-ahead safe).
if not _in_news_window(data["pub_date"], start_dt, end_dt):
continue
news_str += f"### {data['title']} (source: {data['publisher']})\n"
@@ -170,31 +191,25 @@ def get_global_news_yfinance(
start_date = start_dt.strftime("%Y-%m-%d")
news_str = ""
kept = 0
for article in all_news[:limit]:
# Handle both flat and nested structures
if "content" in article:
# Extract uniformly (flat + nested) and apply the same look-ahead-safe
# window filter, so flat articles can't leak future news (#1007).
data = _extract_article_data(article)
# Skip articles published after curr_date (look-ahead guard)
if data.get("pub_date"):
pub_naive = data["pub_date"].replace(tzinfo=None) if hasattr(data["pub_date"], "replace") else data["pub_date"]
if pub_naive > curr_dt + relativedelta(days=1):
if not _in_news_window(data["pub_date"], start_dt, curr_dt):
continue
title = data["title"]
publisher = data["publisher"]
link = data["link"]
summary = data["summary"]
else:
title = article.get("title", "No title")
publisher = article.get("publisher", "Unknown")
link = article.get("link", "")
summary = ""
news_str += f"### {title} (source: {publisher})\n"
if summary:
news_str += f"{summary}\n"
if link:
news_str += f"Link: {link}\n"
news_str += f"### {data['title']} (source: {data['publisher']})\n"
if data["summary"]:
news_str += f"{data['summary']}\n"
if data["link"]:
news_str += f"Link: {data['link']}\n"
news_str += "\n"
kept += 1
# All candidates fell outside the window -> say so rather than return an
# empty-bodied report (#993).
if kept == 0:
return f"No global news found between {start_date} and {curr_date}"
return f"## Global Market News, from {start_date} to {curr_date}:\n\n{news_str}"