改用 .NET System.Speech 实现 Win11 本地 TTS 语音播放

2026-05-11 13:34:03 +08:00
parent ca462e290a
commit fe360fad3c
4 changed files with 67 additions and 96 deletions
@@ -2,12 +2,10 @@
 WECOM_BOT_ID=your_bot_id_here
 WECOM_BOT_SECRET=your_bot_secret_here

-# Xiaomi TTS
-XIAOMI_USER_ID=1136458602
-XIAOMI_TOKEN_PATH=.mi.token
-XIAOMI_SPEAKER_DID=3ba2c1e8-d8cb-45c5-b88a-15624e7a02f3
+# Windows 11 Local TTS (.NET System.Speech via PowerShell)
+TTS_VOICE_NAME=
+TTS_RATE=0

 # TTS Behavior
 TTS_ENABLED=true
 TTS_MAX_TEXT_LENGTH=500
-TTS_TIMEOUT_SECONDS=15
@@ -1,76 +1,42 @@
-import asyncio
-import json
-import logging
-import threading
-from pathlib import Path
-from typing import Tuple, Any, Dict
-
-from aiohttp import ClientSession
-from miservice import MiAccount, MiNAService, MiTokenStore
-
+import logging, subprocess
+from typing import Tuple, Any, Dict, List
 import config

 logger = logging.getLogger(__name__)


-class SafeTokenStore(MiTokenStore):
-    """Wraps MiTokenStore to never lose passToken on auth failure."""
-
-    def __init__(self, token_path):
-        super().__init__(token_path)
-        self._saved_pass_token = ""
-        self._load_backup()
-
-    def _load_backup(self):
-        path = Path(self.token_path)
-        backup = Path(str(path) + ".backup")
-        if backup.exists():
+def _run_ps(commands, timeout=60):
+    """Run PowerShell statements in a single ``powershell.exe`` process.
+
+    Args:
+        commands: Iterable of PowerShell statements; they are joined with
+            ``"; "`` into one ``-Command`` script.
+        timeout: Seconds to wait for the child process before giving up.
+
+    Returns:
+        ``(returncode, output)``. ``output`` is the stripped stdout; when the
+        process exits non-zero, the stripped stderr is appended so the caller
+        can report why it failed. Returns ``(-1, "timeout")`` on timeout and
+        ``(-1, str(exc))`` if the process could not be run at all.
+    """
+    script = "; ".join(commands)
    try:
-                data = json.loads(backup.read_text("utf-8"))
-                self._saved_pass_token = data.get("passToken", "")
-            except Exception:
-                pass
-
-    def _save_backup(self, token):
-        path = Path(self.token_path)
-        backup = Path(str(path) + ".backup")
-        try:
-            backup.write_text(json.dumps(token, ensure_ascii=False, indent=2), encoding="utf-8")
-        except Exception:
-            pass
-
-    def save_token(self, token=None):
-        if token and token.get("passToken"):
-            self._saved_pass_token = token["passToken"]
-            self._save_backup(token)
-        elif token is None and self._saved_pass_token:
-            # miservice is trying to delete token after auth failure
-            # Don't let it — restore from backup
-            logger.warning("miservice tried to wipe token, restoring passToken...")
-            return
-        super().save_token(token)
-
-
-def _run_async_in_thread(coro, timeout: float = 15.0):
-    result = None
-    error = None
-
-    def _target():
-        nonlocal result, error
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        try:
-            result = loop.run_until_complete(coro)
+        p = subprocess.run(
+            ["powershell.exe", "-NoProfile", "-NonInteractive", "-Command", script],
+            # errors="replace": an undecodable byte in the console output must
+            # not turn a successful run into a reported failure.
+            capture_output=True, text=True, errors="replace", timeout=timeout)
+        out = p.stdout.strip()
+        if p.returncode != 0:
+            # PowerShell writes error records to stderr; returning stdout alone
+            # would leave the caller with an empty failure message.
+            out = "\n".join(s for s in (out, p.stderr.strip()) if s)
+        return p.returncode, out
+    except subprocess.TimeoutExpired:
+        return -1, "timeout"
    except Exception as e:
-            error = e
-        finally:
-            loop.close()
+        return -1, str(e)

-    t = threading.Thread(target=_target)
-    t.start()
-    t.join(timeout=timeout)
-    if error:
-        raise error
+
+def list_voices() -> List[Dict[str, str]]:
+    """Return the speech voices installed for .NET System.Speech.
+
+    Runs a PowerShell script that prints one ``VOICE:name|description|culture|
+    gender|age`` line per installed voice and parses those lines.
+
+    Returns:
+        A list of dicts with the keys ``name``, ``description``, ``culture``,
+        ``gender`` and ``age``. The list is empty when no voice line was
+        produced; a failed PowerShell run is logged as a warning.
+    """
+    cmds = [
+        "Add-Type -AssemblyName System.Speech",
+        "$s = New-Object System.Speech.Synthesis.SpeechSynthesizer",
+        "foreach ($v in $s.GetInstalledVoices()) {",
+        "  $i = $v.VoiceInfo",
+        '  Write-Host ("VOICE:" + $i.Name + "|" + $i.Description + "|" + $i.Culture + "|" + $i.Gender + "|" + $i.Age)',
+        "}",
+        "$s.Dispose()",
+    ]
+    code, out = _run_ps(cmds)
+    if code != 0:
+        # Without this, a failed enumeration looks the same as "no voices installed".
+        logger.warning("Listing TTS voices failed (exit code %s): %s", code, out)
+    fields = ("name", "description", "culture", "gender", "age")
+    result = []
+    for line in out.splitlines():
+        if not line.startswith("VOICE:"):
+            continue
+        parts = [part.strip() for part in line[6:].split("|")]
+        if len(parts) >= 5:
+            # zip() stops after the five known fields; extra segments are ignored.
+            result.append(dict(zip(fields, parts)))
    return result


@@ -79,23 +45,35 @@ def speak(text: str) -> Tuple[bool, Dict[str, Any]]:
        logger.info("TTS disabled, skipping: %s", text)
        return True, {"skipped": True}

-    text = text[: config.TTS_MAX_TEXT_LENGTH].strip()
+    text = text[:config.TTS_MAX_TEXT_LENGTH].strip()
    if not text:
        return False, {"error": "empty text after truncation"}

-    async def _tts():
-        token_store = SafeTokenStore(config.XIAOMI_TOKEN_PATH)
-        async with ClientSession() as session:
-            account = MiAccount(
-                session, config.XIAOMI_USER_ID, None, token_store
-            )
-            mina = MiNAService(account)
-            return await mina.text_to_speech(config.XIAOMI_SPEAKER_DID, text)
+    def _ps_literal(value):
+        """Return *value* as a PowerShell single-quoted string literal."""
+        # Single-quoted PowerShell strings perform no `$var`, `$(...)` or
+        # backtick expansion, so message text cannot inject commands. Every
+        # character PowerShell accepts as a single quote (ASCII ' and the
+        # typographic variants U+2018..U+201B) is escaped by doubling it.
+        for quote in "'\u2018\u2019\u201a\u201b":
+            value = value.replace(quote, quote + quote)
+        return "'" + value + "'"
+
+    vname = config.TTS_VOICE_NAME or ""
+    # The rate is documented as -10 (slowest) to 10 (fastest); clamp so a
+    # misconfigured value does not make the synthesizer throw.
+    rate = max(-10, min(10, config.TTS_RATE))
+
+    cmds = [
+        # Turn any error into a terminating one so the process exits non-zero;
+        # otherwise the trailing Dispose() would hide a failed Speak().
+        "$ErrorActionPreference = 'Stop'",
+        "Add-Type -AssemblyName System.Speech",
+        "$s = New-Object System.Speech.Synthesis.SpeechSynthesizer",
+    ]
+    if vname:
+        cmds += [
+            "$n = " + _ps_literal(vname),
+            "foreach ($v in $s.GetInstalledVoices()) {",
+            '  if ($v.VoiceInfo.Name -like ("*" + $n + "*")) { $s.SelectVoice($v.VoiceInfo.Name); break }',
+            "}",
+        ]
+    cmds += [
+        "$s.Rate = " + str(rate),
+        "$s.Volume = 100",
+        "$s.Speak(" + _ps_literal(text) + ")",
+        "$s.Dispose()",
+    ]

    try:
-        result = _run_async_in_thread(_tts(), timeout=config.TTS_TIMEOUT_SECONDS)
-        ok = isinstance(result, dict) and result.get("code") == 0
-        return ok, result or {}
+        code, out = _run_ps(cmds)
+        if code != 0:
+            return False, {"error": f"TTS failed: {out}"}
+        return True, {"spoken": True}
    except Exception as e:
-        logger.exception("TTS call failed")
+        logger.exception("TTS failed")
        return False, {"error": str(e)}
@@ -1,4 +1,4 @@
-import os
+import os
 from pathlib import Path
 from dotenv import load_dotenv

@@ -29,15 +29,10 @@ def _env_int(key: str, default: int) -> int:
 WECOM_BOT_ID = _env("WECOM_BOT_ID")
 WECOM_BOT_SECRET = _env("WECOM_BOT_SECRET")

-# Xiaomi TTS
-XIAOMI_USER_ID = _env("XIAOMI_USER_ID", "1136458602")
-XIAOMI_TOKEN_PATH = _env(
-    "XIAOMI_TOKEN_PATH",
-    str(Path(__file__).resolve().parent / ".mi.token"),
-)
-XIAOMI_SPEAKER_DID = _env("XIAOMI_SPEAKER_DID", "3ba2c1e8-d8cb-45c5-b88a-15624e7a02f3")
+# Windows Local TTS
+TTS_VOICE_NAME = _env("TTS_VOICE_NAME", "")  # empty = system default voice
+TTS_RATE = _env_int("TTS_RATE", 0)            # SAPI rate: -10 (slowest) to 10 (fastest), default 0

 # TTS
 TTS_ENABLED = _env_bool("TTS_ENABLED", True)
 TTS_MAX_TEXT_LENGTH = _env_int("TTS_MAX_TEXT_LENGTH", 500)
-TTS_TIMEOUT_SECONDS = _env_int("TTS_TIMEOUT_SECONDS", 15)
@@ -1,6 +1,6 @@
-websockets>=13.0
+websockets>=13.0
 python-dotenv>=1.0.0
-miservice_fork>=2.9.0
+pywin32>=311
 aiohttp>=3.9.0
 pytest>=8.0.0
 pytest-asyncio>=0.23.0