From 7bb16c5daa31eac602723526865d46b7cd35f04c Mon Sep 17 00:00:00 2001
From: Yijia-Xiao <yijia-xiao@outlook.com>
Date: Sun, 21 Jun 2026 21:03:05 +0000
Subject: [PATCH] chore(models): retire deprecated models, simplify thinking
 config

Trim each provider to current-generation models and drop the special-casing
they required:

- OpenAI: remove gpt-4.1 (deprecated; the only non-reasoning model).
- Anthropic: remove Claude Sonnet 4.5 (legacy; the only Sonnet that returns
  HTTP 400 when an effort setting is sent).
- Google: remove the Gemini 2.5 line (superseded by 3.x).
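- Gemini client: drop the integer thinking_budget mapping and the "gemini-3"
  model-name check; thinking_level is now forwarded as a string for every
  model ID (Pro still remaps the unsupported "minimal" to "low").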

Effort/reasoning gates stay in place as defense in depth for custom model IDs.
All retained model IDs were verified against the live provider APIs.
---
 tests/test_google_api_key.py               |  2 +-
 tests/test_google_thinking_level.py        | 46 ++++++++++++++++++++++
 tradingagents/llm_clients/google_client.py | 20 ++++------
 tradingagents/llm_clients/model_catalog.py |  8 +---
 4 files changed, 55 insertions(+), 21 deletions(-)
 create mode 100644 tests/test_google_thinking_level.py

diff --git a/tests/test_google_api_key.py b/tests/test_google_api_key.py
index 53376ab10..9bad87cea 100644
--- a/tests/test_google_api_key.py
+++ b/tests/test_google_api_key.py
@@ -21,7 +21,7 @@ class TestGoogleApiKeyStandardization(unittest.TestCase):
         for msg, kwargs, expected_key in test_cases:
             with self.subTest(msg=msg):
                 mock_chat.reset_mock()
-                client = GoogleClient("gemini-2.5-flash", **kwargs)
+                client = GoogleClient("gemini-3.5-flash", **kwargs)
                 client.get_llm()
                 call_kwargs = mock_chat.call_args[1]
                 self.assertEqual(call_kwargs.get("google_api_key"), expected_key)
diff --git a/tests/test_google_thinking_level.py b/tests/test_google_thinking_level.py
new file mode 100644
index 000000000..f24ceff2f
--- /dev/null
+++ b/tests/test_google_thinking_level.py
@@ -0,0 +1,46 @@
+"""Gemini thinking_level forwarding (Gemini 3.x).
+
+The catalog is Gemini 3.x only, which takes the string ``thinking_level``
+directly. Pro accepts low/high; Flash also accepts minimal/medium — an
+unsupported "minimal" on Pro is mapped to "low".
+"""
+
+from unittest import mock
+
+import pytest
+
+from tradingagents.llm_clients.google_client import GoogleClient
+
+
+def _captured_kwargs(model, **kwargs):
+    captured = {}
+    with mock.patch.object(
+        __import__("tradingagents.llm_clients.google_client", fromlist=["x"]),
+        "NormalizedChatGoogleGenerativeAI",
+        lambda **kw: captured.setdefault("kw", kw),
+    ):
+        GoogleClient(model, api_key="x", **kwargs).get_llm()
+    return captured["kw"]
+
+
+@pytest.mark.parametrize("level", ["minimal", "low", "medium", "high"])
+def test_flash_passes_thinking_level_through(level):
+    kw = _captured_kwargs("gemini-3.5-flash", thinking_level=level)
+    assert kw["thinking_level"] == level
+    assert "thinking_budget" not in kw  # the 2.5-era param is gone
+
+
+def test_pro_remaps_minimal_to_low():
+    kw = _captured_kwargs("gemini-3.1-pro-preview", thinking_level="minimal")
+    assert kw["thinking_level"] == "low"  # Pro doesn't accept "minimal"
+
+
+def test_pro_keeps_high():
+    kw = _captured_kwargs("gemini-3.1-pro-preview", thinking_level="high")
+    assert kw["thinking_level"] == "high"
+
+
+def test_no_thinking_level_is_omitted():
+    kw = _captured_kwargs("gemini-3.5-flash")
+    assert "thinking_level" not in kw
+    assert "thinking_budget" not in kw
diff --git a/tradingagents/llm_clients/google_client.py b/tradingagents/llm_clients/google_client.py
index df83b6cc8..93bb1d11a 100644
--- a/tradingagents/llm_clients/google_client.py
+++ b/tradingagents/llm_clients/google_client.py
@@ -40,21 +40,15 @@ class GoogleClient(BaseLLMClient):
         if google_api_key:
             llm_kwargs["google_api_key"] = google_api_key
 
-        # Map thinking_level to appropriate API param based on model
-        # Gemini 3 Pro: low, high
-        # Gemini 3 Flash: minimal, low, medium, high
-        # Gemini 2.5: thinking_budget (0=disable, -1=dynamic)
+        # Gemini 3.x takes the string ``thinking_level`` (the integer
+        # ``thinking_budget`` was for the now-retired 2.5 line). Pro accepts
+        # low/high; Flash also accepts minimal/medium — so map an unsupported
+        # "minimal" on Pro to the nearest level it does accept.
         thinking_level = self.kwargs.get("thinking_level")
         if thinking_level:
-            model_lower = self.model.lower()
-            if "gemini-3" in model_lower:
-                # Gemini 3 Pro doesn't support "minimal", use "low" instead
-                if "pro" in model_lower and thinking_level == "minimal":
-                    thinking_level = "low"
-                llm_kwargs["thinking_level"] = thinking_level
-            else:
-                # Gemini 2.5: map to thinking_budget
-                llm_kwargs["thinking_budget"] = -1 if thinking_level == "high" else 0
+            if "pro" in self.model.lower() and thinking_level == "minimal":
+                thinking_level = "low"
+            llm_kwargs["thinking_level"] = thinking_level
 
         return NormalizedChatGoogleGenerativeAI(**llm_kwargs)
 
diff --git a/tradingagents/llm_clients/model_catalog.py b/tradingagents/llm_clients/model_catalog.py
index 5dff0aa09..bcec44add 100644
--- a/tradingagents/llm_clients/model_catalog.py
+++ b/tradingagents/llm_clients/model_catalog.py
@@ -84,7 +84,6 @@ MODEL_OPTIONS: ProviderModeOptions = {
             ("GPT-5.4 Mini - Fast, strong coding and tool use", "gpt-5.4-mini"),
             ("GPT-5.4 Nano - Cheapest, high-volume tasks", "gpt-5.4-nano"),
             ("GPT-5.5 - Latest frontier, 1M context", "gpt-5.5"),
-            ("GPT-4.1 - Smartest non-reasoning model", "gpt-4.1"),
         ],
         "deep": [
             ("GPT-5.5 - Latest frontier, 1M context", "gpt-5.5"),
@@ -97,7 +96,6 @@ MODEL_OPTIONS: ProviderModeOptions = {
         "quick": [
             ("Claude Sonnet 4.6 - Best speed and intelligence balance", "claude-sonnet-4-6"),
             ("Claude Haiku 4.5 - Fastest with near-frontier intelligence", "claude-haiku-4-5"),
-            ("Claude Sonnet 4.5 - High-performance for agents and coding", "claude-sonnet-4-5"),
         ],
         "deep": [
             ("Claude Opus 4.8 - Latest frontier, agentic coding and reasoning", "claude-opus-4-8"),
@@ -109,15 +107,11 @@ MODEL_OPTIONS: ProviderModeOptions = {
     "google": {
         "quick": [
             ("Gemini 3.5 Flash - Latest, frontier agentic + coding (GA)", "gemini-3.5-flash"),
-            ("Gemini 3.1 Flash Lite - Most cost-efficient (GA)", "gemini-3.1-flash-lite"),
-            ("Gemini 2.5 Flash - Balanced, stable", "gemini-2.5-flash"),
-            ("Gemini 2.5 Flash Lite - Fast, low-cost", "gemini-2.5-flash-lite"),
+            ("Gemini 3.1 Flash Lite - Most cost-efficient", "gemini-3.1-flash-lite"),
         ],
         "deep": [
             ("Gemini 3.1 Pro - Reasoning-first, complex workflows (preview)", "gemini-3.1-pro-preview"),
             ("Gemini 3.5 Flash - Latest GA, strong agentic + coding", "gemini-3.5-flash"),
-            ("Gemini 2.5 Pro - Stable pro model", "gemini-2.5-pro"),
-            ("Gemini 2.5 Flash - Balanced, stable", "gemini-2.5-flash"),
         ],
     },
     "xai": {