Qwen3-TTS 開源語音生成模型

01 Apr, 2026

~~（這不是愚人節開玩笑的發文喔！）~~

Qwen3-TTS 只需要一段 30 秒的語音樣本，就能複製你的聲音，用你的聲音說任何話。對我而言，它的好處有：

免費，
離線，
開源！

官方網站：https://github.com/QwenLM/Qwen3-TTS

設定環境

1. 安裝 choco

　可參考終端機設定這篇喔。

2. 安裝 python 3.12 版

(1) 移除新版 python 。

choco uninstall python

(2) 安裝。

choco install python --version=3.12 --force

(3) 更新 python 的套件管理器 pip 。

python.exe -m pip install --upgrade pip

(4) 安裝所需套件

pip install torch numpy soundfile opencc-python-reimplemented qwen_tts

Python 程式碼分享

開發過程：https://notebooklm.google.com/notebook/4d57dfe9-7fa8-4ad6-bcbb-150e1749dedb

部署檔案

檔案放在 C:\QW_Voice 資料夾裡，一個角色建一個 {name}_QW 的資料夾，
⠀⠀其中 {name} 代表角色名稱，也就是第一組程式碼，會詢問的 voice_name ，以及第二組程式碼，要寫在劇本中的角色名稱（可以寫中文）！
資料夾裡面，至少要有：
- {name}_Voice.wav ：參考聲音，
- {name}_reftext.txt ：參考聲音逐字稿，
- {name}_target-text.txt ：要生成的音訊逐字稿內容！（使用第二組程式碼時不需要！）

例如要建立 NVDA 這個角色，那麼在資料夾中要建立：

NVDA_QW 資料夾。
裡面放兩個檔案：

NVDA_Voice.wav ：音檔！
NVDA_reftext.txt ：音檔的逐字稿！

使用第一組程式碼：

需要在資料夾中，多提供 NVDA_target-text.txt ，寫入期待生成的文字內容！
生成後，音檔會出現在資料夾中！

使用第二組程式碼：在角色資料夾外面（也就是 C:\QW_Voice 底下），提供 QW_Script.txt ，寫入劇本，劇本內就可以套用這個角色！

生成單一角色

import os
import torch
import numpy as np
import soundfile as sf
import datetime
import time
import re
from opencc import OpenCC
from qwen_tts import Qwen3TTSModel

# 1. Cap the number of CPU threads used for inference.
# OMP_NUM_THREADS is assigned after torch/numpy were already imported, so the
# OpenMP runtime may have read its configuration before this line runs. The
# variable is kept for anything loaded later, and the same limit is applied
# to torch directly so that it takes effect regardless of import order.
CPU_THREADS = 4
os.environ["OMP_NUM_THREADS"] = str(CPU_THREADS)
torch.set_num_threads(CPU_THREADS)

# --- A. User input and path setup ---
# Every file of a voice lives in base_dir/{voice_name}_QW and is named
# "{voice_name}_" followed by a fixed suffix.
base_dir = r"C:\QW_Voice"
voice_name = input("請輸入 voice_name: ").strip()
work_dir = os.path.join(base_dir, voice_name + "_QW")

# Upper bound on characters per generated chunk (the author recommends
# 100-150 to keep generation stable while preserving sentence context).
MAX_CHARS = 150

# Timestamp embedded in the output file names so runs do not overwrite
# each other.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d,%H-%M-%S")

(
    ref_audio_path,        # reference recording
    ref_text_file,         # transcript of the reference recording
    target_text_file,      # text to synthesise
    save_pt_path,          # cached voice-clone prompt
    output_wav_path,       # generated audio for this run
    simplified_text_path,  # Simplified-Chinese copy of the target text
) = (
    os.path.join(work_dir, f"{voice_name}_{suffix}")
    for suffix in (
        "Voice.wav",
        "reftext.txt",
        "target-text.txt",
        "voice_print.pt",
        f"output_{timestamp}.wav",
        f"target-simplified_{timestamp}.txt",
    )
)

# --- B. Model initialisation ---
print(f"\n--- 正在處理角色: {voice_name} ---")
print("正在載入 Qwen3-TTS 模型...")

# Loading options, passed unchanged to from_pretrained.
model_load_options = {
    "dtype": torch.bfloat16,
    "device_map": "auto",
    "offload_folder": "offload",
}
model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    **model_load_options,
)

# --- C. Voice-print handling ---
# Reuse the saved voice-clone prompt when one exists; otherwise build it from
# the reference recording and its transcript, then save it for later runs.
if os.path.exists(save_pt_path):
    print("偵測到現成聲紋檔案，直接跳至生成階段。")
    # NOTE(review): weights_only=False unpickles arbitrary objects; only load
    # voice-print files that were produced by this script.
    reusable_prompt = torch.load(save_pt_path, weights_only=False)
else:
    print("未發現聲紋檔，啟動聲紋提取流程...")
    # Fail early with a message naming the character folder instead of an
    # opaque error from deep inside the model when a reference file is missing.
    if not (os.path.exists(ref_audio_path) and os.path.exists(ref_text_file)):
        raise FileNotFoundError(f"找不到角色 {voice_name} 的參考資源於 {work_dir}")
    # "utf-8-sig" reads plain UTF-8 as well, and drops a leading BOM that
    # would otherwise become part of the transcript.
    with open(ref_text_file, "r", encoding="utf-8-sig") as f:
        ref_text_content = f.read().strip()
    reusable_prompt = model.create_voice_clone_prompt(ref_audio=ref_audio_path, ref_text=ref_text_content)
    os.makedirs(work_dir, exist_ok=True)
    torch.save(reusable_prompt, save_pt_path)

# --- D. Target text processing (Traditional -> Simplified, then chunking) ---
# "utf-8-sig" reads plain UTF-8 as well, and drops a leading BOM that would
# otherwise be sent to the model as part of the text.
with open(target_text_file, "r", encoding="utf-8-sig") as f:
    original_target_text = f.read().strip()

cc = OpenCC('t2s')
simplified_target_text = cc.convert(original_target_text)

# Step 1: cut the text into sentences, each keeping its terminator
# (。！？? or a newline). A trailing piece without a terminator is kept too.
# findall never yields empty strings, unlike a split on a capturing group.
initial_segments = re.findall(r'[^。！？?\n]*[。！？?\n]|[^。！？?\n]+', simplified_target_text)

# Step 1b: a single sentence may itself exceed MAX_CHARS. Cut such a sentence
# after the last clause mark inside the allowed window, or at the limit when
# the window holds no clause mark, so the limit is honoured.
bounded_segments = []
for seg in initial_segments:
    while len(seg) > MAX_CHARS:
        window = seg[:MAX_CHARS]
        cut = max(window.rfind(mark) for mark in "，、；：,;") + 1
        if cut <= 0:
            cut = MAX_CHARS
        bounded_segments.append(seg[:cut])
        seg = seg[cut:]
    if seg:
        bounded_segments.append(seg)

# Step 2: greedy merging. Each chunk is made as long as possible (to keep
# context and emotion) without exceeding MAX_CHARS. Chunks that are empty
# after stripping (e.g. only newlines) are dropped instead of being sent to
# the model.
final_chunks = []
current_chunk = ""
for seg in bounded_segments:
    if len(current_chunk) + len(seg) <= MAX_CHARS:
        current_chunk += seg
        continue
    if current_chunk.strip():
        final_chunks.append(current_chunk.strip())
    current_chunk = seg
if current_chunk.strip():
    final_chunks.append(current_chunk.strip())

# Keep a copy of the converted text next to the generated audio.
with open(simplified_text_path, "w", encoding="utf-8") as f:
    f.write(simplified_target_text)
print(f"已將文字智慧合併為 {len(final_chunks)} 個長段落，優化情緒表達。")

# --- E. Chunk-by-chunk generation with progress output ---
total_start_time = time.time()
total_chunks = len(final_chunks)
all_audio_segments = []

print(f"\n開始生成語音 (目標總段數: {total_chunks})...")

for position, piece in enumerate(final_chunks, start=1):
    started_at = time.time()
    print(f"[{position}/{total_chunks}] 正在生成 ({len(piece)}字): {piece[:20]}...")

    # Synthesise this chunk with the reusable voice-clone prompt.
    waveform, sampling_rate = model.generate_voice_clone(
        text=piece,
        voice_clone_prompt=reusable_prompt,
        max_new_tokens=2048,
    )

    # Convert a tensor or list result to a NumPy array; anything else is
    # used as returned.
    if torch.is_tensor(waveform):
        waveform = waveform.detach().cpu().numpy()
    elif isinstance(waveform, list):
        waveform = np.array(waveform)

    flat_samples = waveform.astype(np.float32).flatten()
    all_audio_segments.append(flat_samples)

    # Estimate the remaining time from the duration of the chunk just done.
    took = time.time() - started_at
    chunks_left = total_chunks - position
    seconds_left = took * chunks_left

    print(f"   -> 完成！本段耗時: {took:.1f}s | 預估剩餘: {seconds_left/60:.1f}分鐘")

# --- F. Concatenate and write the result ---
# With an empty target text no chunk is generated: np.concatenate would raise
# on the empty list and sampling_rate would never have been assigned, so that
# case is reported instead of crashing.
if all_audio_segments:
    print("\n正在整合音軌...")
    final_audio = np.concatenate(all_audio_segments)
    sf.write(output_wav_path, final_audio, sampling_rate)

    print(f"全部完成！總耗時: {(time.time() - total_start_time)/60:.1f} 分鐘")
    print(f"檔案路徑：{os.path.basename(output_wav_path)}")
else:
    print("未產生任何音訊片段。")

給予劇本，生成對話

import os
import torch
import numpy as np
import soundfile as sf
import datetime
import time
import re
from opencc import OpenCC
from qwen_tts import Qwen3TTSModel

# 1. Cap the number of CPU threads used for inference.
# OMP_NUM_THREADS is assigned after torch/numpy were already imported, so the
# OpenMP runtime may have read its configuration before this line runs. The
# variable is kept for anything loaded later, and the same limit is applied
# to torch directly so that it takes effect regardless of import order.
CPU_THREADS = 4
os.environ["OMP_NUM_THREADS"] = str(CPU_THREADS)
torch.set_num_threads(CPU_THREADS)

# --- A. Paths and initial setup ---
base_dir = r"C:\QW_Voice"
MAX_CHARS = 150  # upper bound on characters per merged chunk
script_path = os.path.join(base_dir, "QW_Script.txt")

# Timestamp embedded in the output file names.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d,%H-%M-%S")
# The generated audio and the Simplified-Chinese script are written directly
# into base_dir.
output_wav_path = os.path.join(base_dir, "Qwen-Audio_" + timestamp + ".wav")
output_script_path = os.path.join(base_dir, "Qwen-Script-Simplified_" + timestamp + ".txt")

# Traditional -> Simplified converter and the TTS model.
cc = OpenCC('t2s')
print("正在載入 Qwen3-TTS 模型...")
# Loading options, passed unchanged to from_pretrained (the original author
# notes bfloat16 as the officially recommended dtype).
model_load_options = {
    "dtype": torch.bfloat16,
    "device_map": "auto",
    "offload_folder": "offload",
}
model = Qwen3TTSModel.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    **model_load_options,
)

# Voice-clone prompts already obtained in this run, keyed by character name,
# so each character's prompt is loaded or extracted only once.
prompt_cache = dict()

def get_voice_prompt(name):
    """Return the voice-clone prompt for character *name*, building it if needed.

    Lookup order: the in-memory ``prompt_cache``; then the saved
    ``{name}_voice_print.pt`` in the character folder; otherwise a new prompt
    is extracted from ``{name}_Voice.wav`` and ``{name}_reftext.txt``, saved
    to the character folder and cached.

    Args:
        name: Character name; its folder is ``{name}_QW`` under ``base_dir``.

    Returns:
        The prompt object loaded from disk or produced by
        ``model.create_voice_clone_prompt``.

    Raises:
        FileNotFoundError: If no saved prompt exists and the reference audio
            or its transcript is missing.
    """
    if name in prompt_cache:
        return prompt_cache[name]

    char_work_dir = os.path.join(base_dir, f"{name}_QW")
    pt_path = os.path.join(char_work_dir, f"{name}_voice_print.pt")

    if os.path.exists(pt_path):
        print(f" -> 載入角色聲紋: {name}")
        # NOTE(review): weights_only=False unpickles arbitrary objects; only
        # load voice-print files that were produced by this script.
        prompt = torch.load(pt_path, weights_only=False)
    else:
        print(f" -> 未發現聲紋，正在為 {name} 提取特徵...")
        audio_ref = os.path.join(char_work_dir, f"{name}_Voice.wav")
        text_ref_file = os.path.join(char_work_dir, f"{name}_reftext.txt")

        if not os.path.exists(audio_ref) or not os.path.exists(text_ref_file):
            raise FileNotFoundError(f"找不到角色 {name} 的參考資源於 {char_work_dir}")

        # "utf-8-sig" reads plain UTF-8 as well, and drops a leading BOM that
        # would otherwise become part of the transcript.
        with open(text_ref_file, "r", encoding="utf-8-sig") as f:
            ref_text = f.read().strip()

        # Build the reusable prompt and save it for later runs.
        prompt = model.create_voice_clone_prompt(ref_audio=audio_ref, ref_text=ref_text)
        os.makedirs(char_work_dir, exist_ok=True)
        torch.save(prompt, pt_path)

    prompt_cache[name] = prompt
    return prompt

def _split_oversized(segment, limit):
    """Cut *segment* into pieces of at most *limit* characters.

    Each cut is made right after the last clause mark (，、；：,;) inside the
    allowed window; when the window holds no clause mark the cut falls
    exactly at *limit*.
    """
    pieces = []
    while len(segment) > limit:
        window = segment[:limit]
        cut = max(window.rfind(mark) for mark in "，、；：,;") + 1
        if cut <= 0:
            cut = limit
        pieces.append(segment[:cut])
        segment = segment[cut:]
    if segment:
        pieces.append(segment)
    return pieces


def split_and_merge_text(text, max_chars=None):
    """Split *text* into sentence-aligned chunks of bounded length.

    The text is cut after each sentence terminator (。！？? or newline), then
    consecutive sentences are greedily merged so that every chunk is as long
    as possible without exceeding the limit. A single sentence longer than
    the limit is cut further, and chunks that are empty after stripping are
    dropped.

    Args:
        text: The text to split.
        max_chars: Maximum characters per chunk; defaults to the module-level
            ``MAX_CHARS`` when omitted.

    Returns:
        A list of non-empty, stripped chunks, in original order.

    Raises:
        ValueError: If the effective limit is smaller than 1.
    """
    limit = MAX_CHARS if max_chars is None else max_chars
    if limit < 1:
        raise ValueError("max_chars must be at least 1")

    # Each match is one sentence with its terminator, or a trailing piece
    # without one. findall never yields the empty strings that a split on a
    # capturing group produces between adjacent terminators.
    sentences = re.findall(r'[^。！？?\n]*[。！？?\n]|[^。！？?\n]+', text)

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        for seg in _split_oversized(sentence, limit):
            if len(current_chunk) + len(seg) <= limit:
                current_chunk += seg
                continue
            stripped = current_chunk.strip()
            if stripped:
                chunks.append(stripped)
            current_chunk = seg
    stripped = current_chunk.strip()
    if stripped:
        chunks.append(stripped)
    return chunks

# --- B. Read and parse the script ---
# Stop when the script file is missing. SystemExit is raised directly because
# the bare exit() helper is injected by the site module (it is not guaranteed
# to exist) and, called without an argument, reports success to the caller.
if not os.path.exists(script_path):
    print(f"錯誤：找不到劇本檔案 {script_path}")
    raise SystemExit(1)

all_audio_segments = []
simplified_script_lines = []

# "utf-8-sig" reads plain UTF-8 as well, and drops a leading BOM that would
# otherwise be glued to the first character name. Blank lines are skipped,
# but each kept line remembers its 1-based number in the file so that
# warnings point at the real line.
with open(script_path, "r", encoding="utf-8-sig") as f:
    lines = [
        (line_no, raw.strip())
        for line_no, raw in enumerate(f, start=1)
        if raw.strip()
    ]

total_lines = len(lines)
total_start_time = time.time()

print(f"\n--- 開始解析劇本並生成音訊 (總計 {total_lines} 行) ---")

for idx, (line_no, line) in enumerate(lines):
    # A script line is "character<TAB>text"; only the first tab separates.
    name_part, tab, text_part = line.partition('\t')

    if not tab:
        print(f"警告：第 {line_no} 行缺少 Tab 分隔符號，已略過。")
        continue

    char_name = name_part.strip()
    original_text = text_part.strip()

    # 1. Convert to Simplified Chinese and remember the line for the
    #    converted script that is written at the end.
    simplified_text = cc.convert(original_text)
    simplified_script_lines.append(f"{char_name}\t{simplified_text}")

    # 2. Get the character's voice prompt; a line whose character cannot be
    #    prepared is skipped (best effort) rather than aborting the whole run.
    try:
        current_prompt = get_voice_prompt(char_name)
    except Exception as e:
        print(f"跳過行 {line_no}: {e}")
        continue

    # 3. Chunk the line and synthesise each chunk.
    text_chunks = split_and_merge_text(simplified_text)
    print(f"[{idx+1}/{total_lines}] {char_name}: {simplified_text[:15]}... (分 {len(text_chunks)} 段)")

    for chunk in text_chunks:
        audio_seg, sampling_rate = model.generate_voice_clone(
            text=chunk,
            voice_clone_prompt=current_prompt,
            max_new_tokens=2048
        )

        # Convert a list or tensor result to a NumPy array; anything else is
        # used as returned.
        if isinstance(audio_seg, list):
            audio_seg = np.array(audio_seg)
        elif torch.is_tensor(audio_seg):
            audio_seg = audio_seg.detach().cpu().numpy()

        all_audio_segments.append(audio_seg.astype(np.float32).flatten())

# --- C. Concatenate, then write the audio and the converted script ---
if not all_audio_segments:
    print("未產生任何音訊片段。")
else:
    print("\n正在整合最終音軌...")
    merged_track = np.concatenate(all_audio_segments)
    sf.write(output_wav_path, merged_track, sampling_rate)

    # One "character<TAB>text" entry per line, in script order.
    simplified_script = "\n".join(simplified_script_lines)
    with open(output_script_path, "w", encoding="utf-8") as script_file:
        script_file.write(simplified_script)

    elapsed_minutes = (time.time() - total_start_time) / 60
    print(f"\n全部完成！總耗時: {elapsed_minutes:.1f} 分鐘")
    print(f"音檔已儲存至：{output_wav_path}")
    print(f"簡體劇本已儲存至：{output_script_path}")

示範一下

這個是劇本，角色名稱與內容中間用 tab 隔開，一則對話一行！

Patrick	Hello 大家，我是「水壺翔」！ 今天我要現場示範，如何「使喚」我的數位分身，也是我的最強助手：N V D A！
NVDA	NVDA 已啟動！
Patrick	OK，它已經準備好，要「讀」給我聽囉！ 我先把你的語速調慢一點。
NVDA	速度速度 15 ！
Patrick	好，現在請讀出我剛剛在記事本裡寫的那則「冷笑話」！
NVDA	「有一天，小明對者勒色桶說：『我愛你！』，勒色桶回答：『請不要亂丟勒色。』」
Patrick	哈哈哈，這是我寫過最冷的一則。 大家看，只要是文字，NVDA 都能一行一行讀得很順。 但如果遇到「表格」呢？ 嘿嘿，NVDA 你來示範一下，如果我不指揮你，你會怎麼念？
NVDA	「8。 1。 6。 3。 5。 7。 4。 9。 2。」
Patrick	聽到了嗎？大家聽起來像是一串亂碼。 但其實這是一個 3 乘以 3 的九宮格表格喔！ 現在，看我使出「座標定位法」，我按住 ctrl 加 alt 加上方向鍵。 N V D A ，請報出第二列的數字。
NVDA	「第二列，第一欄：3。第二欄：5。第三欄：7。」
Patrick	酷吧！ 同一個表格，我透過手上的組合鍵，就能在腦中建立「心理地圖」，知道 5 就在正中間。 這就是我們視障者閱讀表格的秘訣！
NVDA	主人，表格我沒問題，那這串長得像外星語的代碼呢？
Patrick	哦，那是 La Tex 代碼！ 看我按下快捷鍵，召喚妳的夥伴 A8M 附加元件來幫忙。
NVDA	分數，2A 分之負 B，加減根號，B 的平方減 4AC
Patrick	帥吧！ 透過 A8M 這個「翻譯機」，我能輕鬆搞定複雜數學。最後，我來考考妳，螢幕上這張照片是什麼？
NVDA	「圖片。」
Patrick	蛤，甚麼？
NVDA	圖片。 圖片。
Patrick	娃，就這樣？ 裡面明明是一塊草莓蛋糕耶！
NVDA	主人，沒有人幫這張圖設定「替代文字」啦！ 對我來說，沒有文字說明的圖片，就只是一個沒意義的框框而已。
Patrick	嗯哼，大家聽到了吧，這就是數位學習平台，最需要大家幫忙的地方。 NVDA，今天辛苦你囉！ 按下 `NVDA + Q` ，我們收工。
NVDA	結束 NVDA。
Patrick	OK，NVDA 下班了。今天大家學到了嗎？這裡幫大家整理三個重點：第一是「讀文字」： NVDA 會一行一行念，很適合聽笑話或課文。 第二是「看表格」：不能只靠聽，要用 `Ctrl 加 Alt 加上方向鍵` 來定位，在腦中蓋出「心理地圖」。 第三是「寫數學」：我們是用 La Tex 代碼搭配 A8M 外掛程式當作橋樑，讓代碼變回好聽的數學公式。 最後，別忘了幫照片寫下「替代文字」，我們才能跟你們一起，分享這塊草莓蛋糕喔！ 我是「水壺翔」，我們下次見！ 掰掰！

接著輸入我跟電腦語音的聲音，然後再把結果拿來剪剪貼貼，就可以變成以下這樣：
⠀⠀（講話聲音都是電腦生成的喔！）

玩完之後

害怕：只要有乾淨的聲音，都可以輕易被模仿。以後自己分享時，可以合理要求，為我墊上背景音樂，保護我的個資！
方便：專業麥克風，只要拿出來一次，未來的聲音，都可以讓電腦產出，再也不用辛苦的，到處追人錄音，就可以實現我的小劇場了！
期待：是因為看到這篇，加上自己又是，對聲音算敏感的視障者，所以才開始研究~~（盧 AI）~~的！希望因著這些工具的出現，我們能更有行動力，來探索自己的興趣喔！

如果想看更多，如何操作電腦的內容，歡迎去「過度速成班」逛逛喔！