mirror of
https://github.com/TauricResearch/TradingAgents.git
synced 2026-06-16 21:06:15 +03:00
fix(reddit): go RSS-first with 429 backoff and robust transport errors
The JSON search endpoint is reliably WAF-blocked (403) for public clients, so probing it on every call doubled request volume against Reddit's per-IP rate limit and tripped 429 on the RSS fallback, blanking the sentiment feed. Fetch the Atom/RSS feed directly (JSON kept as an opt-in path that still degrades to RSS on 403), back off once on a 429 honouring Retry-After, and pace requests a little wider. Also broaden the error handling to catch http.client chunked transfer errors (IncompleteRead/BadStatusLine) alongside OSError, which on their own slipped through and crashed the pipeline.
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
"""Tests for the Reddit RSS/Atom fallback when the JSON endpoint 403s (#862)."""
|
||||
"""Tests for the RSS-first Reddit fetcher, its 429 backoff, the opt-in JSON
|
||||
path's degradation (#862), and chunked-transfer error handling (#1024)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import http.client
|
||||
from unittest.mock import patch
|
||||
from urllib.error import HTTPError
|
||||
|
||||
@@ -9,7 +11,6 @@ import pytest
|
||||
|
||||
from tradingagents.dataflows import reddit
|
||||
|
||||
|
||||
_SAMPLE_ATOM = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<entry>
|
||||
@@ -26,6 +27,30 @@ _SAMPLE_ATOM = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
"""
|
||||
|
||||
|
||||
def _resp(read_fn):
|
||||
"""A minimal context-manager response whose read() runs ``read_fn``."""
|
||||
class _Resp:
|
||||
def __enter__(self_inner):
|
||||
return self_inner
|
||||
|
||||
def __exit__(self_inner, *a):
|
||||
return False
|
||||
|
||||
def read(self_inner):
|
||||
return read_fn()
|
||||
return _Resp()
|
||||
|
||||
|
||||
def _atom_resp():
|
||||
return _resp(lambda: _SAMPLE_ATOM.encode("utf-8"))
|
||||
|
||||
|
||||
def _raise(exc):
|
||||
def _r():
|
||||
raise exc
|
||||
return _resp(_r)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
class TestIsoToTimestamp:
|
||||
def test_parses_offset_and_z(self):
|
||||
@@ -48,19 +73,9 @@ class TestStripHtml:
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
class TestRssFallbackParsing:
|
||||
def _patch_rss_response(self, xml_bytes):
|
||||
class _Resp:
|
||||
def __enter__(self_inner):
|
||||
return self_inner
|
||||
def __exit__(self_inner, *a):
|
||||
return False
|
||||
def read(self_inner):
|
||||
return xml_bytes
|
||||
return patch.object(reddit, "urlopen", return_value=_Resp())
|
||||
|
||||
class TestRssParsing:
|
||||
def test_parses_atom_entries(self):
|
||||
with self._patch_rss_response(_SAMPLE_ATOM.encode("utf-8")):
|
||||
with patch.object(reddit, "urlopen", return_value=_atom_resp()):
|
||||
posts = reddit._fetch_subreddit_rss("NVDA", "stocks", limit=5, timeout=5.0)
|
||||
assert len(posts) == 2
|
||||
assert posts[0]["title"] == "NVDA earnings beat, stock pops"
|
||||
@@ -71,21 +86,84 @@ class TestRssFallbackParsing:
|
||||
assert "datacenter unit" in posts[0]["selftext"]
|
||||
|
||||
def test_malformed_xml_fails_open(self):
|
||||
with self._patch_rss_response(b"<<not xml>>"):
|
||||
with patch.object(reddit, "urlopen", return_value=_resp(lambda: b"<<not xml>>")):
|
||||
assert reddit._fetch_subreddit_rss("NVDA", "stocks", 5, 5.0) == []
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
class TestJsonFallsBackToRss:
|
||||
def test_403_triggers_rss(self):
|
||||
err = HTTPError("url", 403, "Blocked", {}, None)
|
||||
with patch.object(reddit, "urlopen", side_effect=err), \
|
||||
patch.object(reddit, "_fetch_subreddit_rss", return_value=[{"title": "x", "source": "rss", "score": None, "num_comments": None, "created_utc": None, "selftext": ""}]) as rss:
|
||||
class TestFetchSubredditIsRssFirst:
|
||||
"""The default per-subreddit fetch goes straight to RSS — it must not hit
|
||||
the WAF-blocked JSON endpoint, which only burned rate-limit budget."""
|
||||
|
||||
def test_delegates_to_rss_without_touching_json(self):
|
||||
sentinel = [{"title": "x", "source": "rss", "score": None,
|
||||
"num_comments": None, "created_utc": None, "selftext": ""}]
|
||||
with patch.object(reddit, "_fetch_subreddit_rss", return_value=sentinel) as rss, \
|
||||
patch.object(reddit, "urlopen",
|
||||
side_effect=AssertionError("JSON endpoint must not be called")):
|
||||
out = reddit._fetch_subreddit("NVDA", "stocks", 5, 5.0)
|
||||
rss.assert_called_once()
|
||||
assert out is sentinel
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
class TestJsonPathFallsBackToRss:
|
||||
"""The opt-in JSON path still degrades to RSS on a 403 (kept for #862)."""
|
||||
|
||||
def test_403_triggers_rss(self):
|
||||
err = HTTPError("url", 403, "Blocked", {}, None)
|
||||
rss_posts = [{"title": "x", "source": "rss", "score": None,
|
||||
"num_comments": None, "created_utc": None, "selftext": ""}]
|
||||
with patch.object(reddit, "urlopen", side_effect=err), \
|
||||
patch.object(reddit, "_fetch_subreddit_rss", return_value=rss_posts) as rss:
|
||||
out = reddit._fetch_subreddit_json("NVDA", "stocks", 5, 5.0)
|
||||
rss.assert_called_once()
|
||||
assert out and out[0]["source"] == "rss"
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
class TestRss429Backoff:
|
||||
def test_429_then_success_retries_once(self):
|
||||
err = HTTPError("url", 429, "Too Many Requests", {}, None)
|
||||
with patch.object(reddit, "urlopen", side_effect=[err, _atom_resp()]) as op, \
|
||||
patch.object(reddit.time, "sleep") as slept:
|
||||
posts = reddit._fetch_subreddit_rss("NVDA", "stocks", 5, 5.0)
|
||||
assert op.call_count == 2 # original + exactly one retry
|
||||
slept.assert_called_once() # backed off before retrying
|
||||
assert len(posts) == 2
|
||||
|
||||
def test_429_twice_gives_up_after_one_retry(self):
|
||||
err = HTTPError("url", 429, "Too Many Requests", {}, None)
|
||||
with patch.object(reddit, "urlopen", side_effect=[err, err]) as op, \
|
||||
patch.object(reddit.time, "sleep"):
|
||||
posts = reddit._fetch_subreddit_rss("NVDA", "stocks", 5, 5.0)
|
||||
assert op.call_count == 2 # one retry, then gives up cleanly
|
||||
assert posts == []
|
||||
|
||||
def test_retry_after_header_is_honoured(self):
|
||||
err = HTTPError("url", 429, "Too Many Requests", {"Retry-After": "12"}, None)
|
||||
with patch.object(reddit, "urlopen", side_effect=[err, _atom_resp()]), \
|
||||
patch.object(reddit.time, "sleep") as slept:
|
||||
reddit._fetch_subreddit_rss("NVDA", "stocks", 5, 5.0)
|
||||
slept.assert_called_once_with(12.0)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
class TestChunkedTransferErrorsHandled:
|
||||
"""IncompleteRead/RemoteDisconnected come from http.client and are NOT
|
||||
OSErrors, so they were previously uncaught and crashed the pipeline (#1024)."""
|
||||
|
||||
def test_rss_incomplete_read_degrades_to_empty(self):
|
||||
with patch.object(reddit, "urlopen", return_value=_raise(http.client.IncompleteRead(b""))):
|
||||
assert reddit._fetch_subreddit_rss("NVDA", "stocks", 5, 5.0) == []
|
||||
|
||||
def test_json_incomplete_read_falls_back_to_rss(self):
|
||||
with patch.object(reddit, "urlopen", return_value=_raise(http.client.IncompleteRead(b""))), \
|
||||
patch.object(reddit, "_fetch_subreddit_rss", return_value=[]) as rss:
|
||||
reddit._fetch_subreddit_json("NVDA", "stocks", 5, 5.0)
|
||||
rss.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
class TestFormatterHandlesRssPosts:
|
||||
def test_rss_posts_omit_fake_counts_and_note_source(self):
|
||||
|
||||
Reference in New Issue
Block a user