mirror of
https://github.com/TauricResearch/TradingAgents.git
synced 2026-06-16 21:06:15 +03:00
fix(data): keep future/undated news out of historical windows
The yfinance news date filter only ran when an article had a parsed date, so flat-format and undated articles bypassed it and leaked future news into historical/backtest runs. Parse the flat providerPublishTime, apply one look-ahead-safe window rule across ticker and global news (undated kept only when the window reaches the present), and return an informative message when everything is filtered out.
This commit is contained in:
80
tests/test_news_lookahead.py
Normal file
80
tests/test_news_lookahead.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""yfinance news must not leak future-dated (or undated, in a backtest) articles
|
||||
into a historical window.
|
||||
|
||||
Regressions for #992 (flat articles bypassed the date filter), #1007 (global
|
||||
news injected future articles), #993 (empty-after-filter returned a blank body).
|
||||
"""
|
||||
import time
|
||||
from datetime import datetime
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
||||
import tradingagents.dataflows.yfinance_news as ynews
|
||||
|
||||
|
||||
def _epoch(date_str):
|
||||
return int(time.mktime(datetime.strptime(date_str, "%Y-%m-%d").timetuple()))
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_flat_article_publish_time_is_parsed():
    """Regression for #992: a flat article's epoch publish time yields a pub_date."""
    # Flat (non-nested) article carrying only an epoch ``providerPublishTime``.
    flat_article = {
        "title": "X",
        "publisher": "P",
        "link": "l",
        "providerPublishTime": _epoch("2025-05-09"),
    }

    extracted = ynews._extract_article_data(flat_article)
    published = extracted["pub_date"]

    # Previously this was always None, which made the article unfilterable.
    assert published is not None
    assert published.strftime("%Y-%m-%d") == "2025-05-09"
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_window_excludes_future_and_undated_in_backtest():
    """A historical window keeps in-range articles and drops future/undated ones."""
    # Historical window (well in the past).
    window = (datetime(2025, 5, 1), datetime(2025, 5, 9))

    # Article dated inside the window is kept.
    assert ynews._in_news_window(datetime(2025, 5, 5), *window) is True
    # Article dated after the window is blocked (look-ahead guard).
    assert ynews._in_news_window(datetime(2025, 6, 1), *window) is False
    # Undated article is excluded in a backtest window.
    assert ynews._in_news_window(None, *window) is False
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_window_keeps_undated_in_live_window():
    """An undated article is kept when the window reaches the present."""
    # Live window (ends "now"): an undated article cannot be future news here.
    live_start, live_end = datetime.now(), datetime.now()

    assert ynews._in_news_window(None, live_start, live_end) is True
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_global_news_future_flat_article_excluded(monkeypatch):
    """Regression for #1007: a flat, future-dated global article is filtered out."""

    def _flat(title, day):
        # Build a flat-format article published at local midnight of ``day``.
        return {
            "title": title,
            "publisher": "P",
            "link": "l",
            "providerPublishTime": _epoch(day),
        }

    articles = (
        _flat("FUTURE EVENT", "2025-06-01"),
        _flat("PAST EVENT", "2025-05-05"),
    )

    class FakeSearch:
        """Stand-in for ``yf.Search`` that exposes a fixed ``news`` list."""

        def __init__(self, *a, **k):
            # Fresh list per instance, future article first.
            self.news = list(articles)

    monkeypatch.setattr(ynews.yf, "Search", FakeSearch)

    report = ynews.get_global_news_yfinance("2025-05-09", look_back_days=7, limit=10)

    assert "PAST EVENT" in report
    assert "FUTURE EVENT" not in report  # #1007
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_global_news_empty_after_filter_is_informative(monkeypatch):
    """Regression for #993: an all-filtered result returns a clear message."""
    # The only candidate article is dated after the requested window.
    future_only = {
        "title": "FUTURE",
        "publisher": "P",
        "link": "l",
        "providerPublishTime": _epoch("2025-06-01"),
    }

    class FakeSearch:
        """Stand-in for ``yf.Search`` returning just the future article."""

        def __init__(self, *a, **k):
            self.news = [future_only]

    monkeypatch.setattr(ynews.yf, "Search", FakeSearch)

    report = ynews.get_global_news_yfinance("2025-05-09", look_back_days=7, limit=10)

    assert "No global news found" in report
    # No article heading may be emitted when nothing survived the filter.
    assert "###" not in report
|
||||
@@ -41,16 +41,39 @@ def _extract_article_data(article: dict) -> dict:
|
||||
"pub_date": pub_date,
|
||||
}
|
||||
else:
|
||||
# Fallback for flat structure
|
||||
# Fallback for flat structure. Parse the epoch publish time so flat
|
||||
# articles are date-filterable too (otherwise they bypass the
|
||||
# historical window and leak future news, #992/#1007).
|
||||
pub_date = None
|
||||
ts = article.get("providerPublishTime")
|
||||
if ts:
|
||||
try:
|
||||
pub_date = datetime.fromtimestamp(ts)
|
||||
except (ValueError, OSError, TypeError):
|
||||
pass
|
||||
return {
|
||||
"title": article.get("title", "No title"),
|
||||
"summary": article.get("summary", ""),
|
||||
"publisher": article.get("publisher", "Unknown"),
|
||||
"link": article.get("link", ""),
|
||||
"pub_date": None,
|
||||
"pub_date": pub_date,
|
||||
}
|
||||
|
||||
|
||||
def _in_news_window(pub_date, start_dt, end_dt) -> bool:
    """Decide whether an article belongs in the ``[start_dt, end_dt]`` window.

    Args:
        pub_date: The article's publish datetime, or ``None`` when undated.
        start_dt: Inclusive lower bound of the window.
        end_dt: End of the window; dated articles are accepted up to one day
            past it (inclusive).

    Returns:
        ``True`` if the article should be kept. A dated article is kept only
        when it lies inside the window. An undated article is kept only when
        the window reaches the present (``end_dt`` no older than one day
        before now); in a historical/backtest window it is dropped, because
        it cannot be shown not to be future news (look-ahead safety,
        #992/#1007).
    """
    if pub_date is None:
        # Undated: keep only for a live window that reaches the present.
        live_cutoff = datetime.now() - relativedelta(days=1)
        return end_dt >= live_cutoff

    # Drop tzinfo (no conversion) so the value compares with the window bounds.
    if hasattr(pub_date, "replace"):
        stamp = pub_date.replace(tzinfo=None)
    else:
        stamp = pub_date

    # The upper bound is computed only after the lower bound passes.
    return start_dt <= stamp and stamp <= end_dt + relativedelta(days=1)
|
||||
|
||||
|
||||
def get_news_yfinance(
|
||||
ticker: str,
|
||||
start_date: str,
|
||||
@@ -85,11 +108,9 @@ def get_news_yfinance(
|
||||
for article in news:
|
||||
data = _extract_article_data(article)
|
||||
|
||||
# Filter by date if publish time is available
|
||||
if data["pub_date"]:
|
||||
pub_date_naive = data["pub_date"].replace(tzinfo=None)
|
||||
if not (start_dt <= pub_date_naive <= end_dt + relativedelta(days=1)):
|
||||
continue
|
||||
# Keep only articles within the requested window (look-ahead safe).
|
||||
if not _in_news_window(data["pub_date"], start_dt, end_dt):
|
||||
continue
|
||||
|
||||
news_str += f"### {data['title']} (source: {data['publisher']})\n"
|
||||
if data["summary"]:
|
||||
@@ -170,31 +191,25 @@ def get_global_news_yfinance(
|
||||
start_date = start_dt.strftime("%Y-%m-%d")
|
||||
|
||||
news_str = ""
|
||||
kept = 0
|
||||
for article in all_news[:limit]:
|
||||
# Handle both flat and nested structures
|
||||
if "content" in article:
|
||||
data = _extract_article_data(article)
|
||||
# Skip articles published after curr_date (look-ahead guard)
|
||||
if data.get("pub_date"):
|
||||
pub_naive = data["pub_date"].replace(tzinfo=None) if hasattr(data["pub_date"], "replace") else data["pub_date"]
|
||||
if pub_naive > curr_dt + relativedelta(days=1):
|
||||
continue
|
||||
title = data["title"]
|
||||
publisher = data["publisher"]
|
||||
link = data["link"]
|
||||
summary = data["summary"]
|
||||
else:
|
||||
title = article.get("title", "No title")
|
||||
publisher = article.get("publisher", "Unknown")
|
||||
link = article.get("link", "")
|
||||
summary = ""
|
||||
|
||||
news_str += f"### {title} (source: {publisher})\n"
|
||||
if summary:
|
||||
news_str += f"{summary}\n"
|
||||
if link:
|
||||
news_str += f"Link: {link}\n"
|
||||
# Extract uniformly (flat + nested) and apply the same look-ahead-safe
|
||||
# window filter, so flat articles can't leak future news (#1007).
|
||||
data = _extract_article_data(article)
|
||||
if not _in_news_window(data["pub_date"], start_dt, curr_dt):
|
||||
continue
|
||||
news_str += f"### {data['title']} (source: {data['publisher']})\n"
|
||||
if data["summary"]:
|
||||
news_str += f"{data['summary']}\n"
|
||||
if data["link"]:
|
||||
news_str += f"Link: {data['link']}\n"
|
||||
news_str += "\n"
|
||||
kept += 1
|
||||
|
||||
# All candidates fell outside the window -> say so rather than return an
|
||||
# empty-bodied report (#993).
|
||||
if kept == 0:
|
||||
return f"No global news found between {start_date} and {curr_date}"
|
||||
|
||||
return f"## Global Market News, from {start_date} to {curr_date}:\n\n{news_str}"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user