改用 .NET System.Speech 实现 Win11 本地 TTS 语音播放

This commit is contained in:
2026-05-11 13:34:03 +08:00
parent ca462e290a
commit fe360fad3c
4 changed files with 67 additions and 96 deletions
+3 -5
View File
@@ -2,12 +2,10 @@
WECOM_BOT_ID=your_bot_id_here
WECOM_BOT_SECRET=your_bot_secret_here
# Xiaomi TTS
XIAOMI_USER_ID=1136458602
XIAOMI_TOKEN_PATH=.mi.token
XIAOMI_SPEAKER_DID=3ba2c1e8-d8cb-45c5-b88a-15624e7a02f3
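# Windows 11 Local TTS (.NET System.Speech via PowerShell)
# TTS_VOICE_NAME: part of an installed voice's name; leave empty to use the system default voice
# TTS_RATE: speaking rate from -10 (slowest) to 10 (fastest); 0 is the normal rate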
TTS_VOICE_NAME=
TTS_RATE=0
# TTS Behavior
TTS_ENABLED=true
TTS_MAX_TEXT_LENGTH=500
TTS_TIMEOUT_SECONDS=15
+56 -78
View File
@@ -1,76 +1,42 @@
import asyncio
import json
import logging
import threading
from pathlib import Path
from typing import Tuple, Any, Dict
from aiohttp import ClientSession
from miservice import MiAccount, MiNAService, MiTokenStore
import logging, subprocess
from typing import Tuple, Any, Dict, List
import config
logger = logging.getLogger(__name__)
class SafeTokenStore(MiTokenStore):
"""Wraps MiTokenStore to never lose passToken on auth failure."""
def __init__(self, token_path):
super().__init__(token_path)
self._saved_pass_token = ""
self._load_backup()
def _load_backup(self):
path = Path(self.token_path)
backup = Path(str(path) + ".backup")
if backup.exists():
def _run_ps(commands, timeout=60):
script = "; ".join(commands)
try:
data = json.loads(backup.read_text("utf-8"))
self._saved_pass_token = data.get("passToken", "")
except Exception:
pass
def _save_backup(self, token):
path = Path(self.token_path)
backup = Path(str(path) + ".backup")
try:
backup.write_text(json.dumps(token, ensure_ascii=False, indent=2), encoding="utf-8")
except Exception:
pass
def save_token(self, token=None):
if token and token.get("passToken"):
self._saved_pass_token = token["passToken"]
self._save_backup(token)
elif token is None and self._saved_pass_token:
# miservice is trying to delete token after auth failure
# Don't let it — restore from backup
logger.warning("miservice tried to wipe token, restoring passToken...")
return
super().save_token(token)
def _run_async_in_thread(coro, timeout: float = 15.0):
result = None
error = None
def _target():
nonlocal result, error
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
result = loop.run_until_complete(coro)
p = subprocess.run(
["powershell.exe", "-NoProfile", "-NonInteractive", "-Command", script],
capture_output=True, text=True, timeout=timeout)
return p.returncode, p.stdout.strip()
except subprocess.TimeoutExpired:
return -1, "timeout"
except Exception as e:
error = e
finally:
loop.close()
return -1, str(e)
t = threading.Thread(target=_target)
t.start()
t.join(timeout=timeout)
if error:
raise error
def list_voices() -> List[Dict[str, str]]:
    """Return the speech voices installed on this Windows machine.

    Runs a PowerShell script that enumerates
    ``SpeechSynthesizer.GetInstalledVoices()`` and prints one
    ``VOICE:name|description|culture|gender|age`` line per voice, then
    parses those lines out of the captured output.

    Returns:
        One dict per voice with the keys ``name``, ``description``,
        ``culture``, ``gender`` and ``age``. The list is empty when no
        voice line could be parsed, which includes the case where the
        PowerShell run failed or timed out (a warning is logged then).
    """
    cmds = [
        "Add-Type -AssemblyName System.Speech",
        "$s = New-Object System.Speech.Synthesis.SpeechSynthesizer",
        "foreach ($v in $s.GetInstalledVoices()) {",
        " $i = $v.VoiceInfo",
        ' Write-Host ("VOICE:" + $i.Name + "|" + $i.Description + "|" + $i.Culture + "|" + $i.Gender + "|" + $i.Age)',
        "}",
        "$s.Dispose()",
    ]
    code, out = _run_ps(cmds)
    if code != 0:
        # Without this, a failed or timed-out PowerShell run is
        # indistinguishable from "no voices installed". Stay best-effort:
        # report the problem and still parse whatever output was captured.
        logger.warning("Listing TTS voices failed (exit code %s): %s", code, out)

    prefix = "VOICE:"
    keys = ("name", "description", "culture", "gender", "age")
    voices = []
    for line in out.splitlines():
        # Skip anything that is not one of our tagged voice lines.
        if not line.startswith(prefix):
            continue
        fields = [field.strip() for field in line[len(prefix):].split("|")]
        # A well-formed line carries at least the five expected fields;
        # extra trailing fields are ignored.
        if len(fields) < len(keys):
            continue
        voices.append(dict(zip(keys, fields)))
    return voices
@@ -79,23 +45,35 @@ def speak(text: str) -> Tuple[bool, Dict[str, Any]]:
logger.info("TTS disabled, skipping: %s", text)
return True, {"skipped": True}
text = text[: config.TTS_MAX_TEXT_LENGTH].strip()
text = text[:config.TTS_MAX_TEXT_LENGTH].strip()
if not text:
return False, {"error": "empty text after truncation"}
async def _tts():
token_store = SafeTokenStore(config.XIAOMI_TOKEN_PATH)
async with ClientSession() as session:
account = MiAccount(
session, config.XIAOMI_USER_ID, None, token_store
)
mina = MiNAService(account)
return await mina.text_to_speech(config.XIAOMI_SPEAKER_DID, text)
safe = text.replace(chr(34), chr(34) + chr(34))
vname = (config.TTS_VOICE_NAME or "").replace(chr(34), chr(34) + chr(34))
cmds = [
"Add-Type -AssemblyName System.Speech",
"$s = New-Object System.Speech.Synthesis.SpeechSynthesizer",
]
if vname:
cmds += [
"foreach ($v in $s.GetInstalledVoices()) {",
' if ($v.VoiceInfo.Name -like "*' + vname + '*") { $s.SelectVoice($v.VoiceInfo.Name); break }',
"}",
]
cmds += [
"$s.Rate = " + str(config.TTS_RATE),
"$s.Volume = 100",
'$s.Speak("' + safe + '")',
"$s.Dispose()",
]
try:
result = _run_async_in_thread(_tts(), timeout=config.TTS_TIMEOUT_SECONDS)
ok = isinstance(result, dict) and result.get("code") == 0
return ok, result or {}
code, out = _run_ps(cmds)
if code != 0:
return False, {"error": f"TTS failed: {out}"}
return True, {"spoken": True}
except Exception as e:
logger.exception("TTS call failed")
logger.exception("TTS failed")
return False, {"error": str(e)}
+4 -9
View File
@@ -1,4 +1,4 @@
import os
import os
from pathlib import Path
from dotenv import load_dotenv
@@ -29,15 +29,10 @@ def _env_int(key: str, default: int) -> int:
WECOM_BOT_ID = _env("WECOM_BOT_ID")
WECOM_BOT_SECRET = _env("WECOM_BOT_SECRET")
# Xiaomi TTS
XIAOMI_USER_ID = _env("XIAOMI_USER_ID", "1136458602")
XIAOMI_TOKEN_PATH = _env(
"XIAOMI_TOKEN_PATH",
str(Path(__file__).resolve().parent / ".mi.token"),
)
XIAOMI_SPEAKER_DID = _env("XIAOMI_SPEAKER_DID", "3ba2c1e8-d8cb-45c5-b88a-15624e7a02f3")
# Windows Local TTS
# TTS_VOICE_NAME is matched as a wildcard substring ("*name*") against the
# installed voice names when speech is synthesized.
TTS_VOICE_NAME = _env("TTS_VOICE_NAME", "") # empty = system default voice
TTS_RATE = _env_int("TTS_RATE", 0) # SAPI rate: -10 (slowest) to 10 (fastest), default 0
# TTS
# TTS_ENABLED: presumably when this is false speak() logs "TTS disabled, skipping"
# and returns without speaking — the check itself is outside the visible hunk; confirm.
TTS_ENABLED = _env_bool("TTS_ENABLED", True)
# Text handed to speak() is cut to this many characters before synthesis.
TTS_MAX_TEXT_LENGTH = _env_int("TTS_MAX_TEXT_LENGTH", 500)
# NOTE(review): the visible speak() code calls _run_ps(cmds) without a timeout
# argument, so _run_ps's own default (60) applies — confirm whether this
# setting is still honoured anywhere.
TTS_TIMEOUT_SECONDS = _env_int("TTS_TIMEOUT_SECONDS", 15)
+2 -2
View File
@@ -1,6 +1,6 @@
websockets>=13.0
websockets>=13.0
python-dotenv>=1.0.0
miservice_fork>=2.9.0
pywin32>=311
aiohttp>=3.9.0
pytest>=8.0.0
pytest-asyncio>=0.23.0