Python语音转文字案例实操？

wen python案例 2026-06-07 01:28:23 2

本文目录导读：

方法一：使用SpeechRecognition库（在线识别）
方法二：使用百度语音识别API
方法三：使用Whisper（OpenAI开源模型，支持离线）
方法四：完整GUI应用（使用Tkinter）
方法五：语音转文字的进阶应用
使用注意事项
最佳实践建议

我将为您提供一个完整的Python语音转文字案例，使用多种主流方法实现。

方法一：使用SpeechRecognition库（在线识别）

安装依赖

pip install SpeechRecognition
pip install PyAudio  # 录音所需
pip install pocketsphinx  # 可选：仅在使用离线识别 recognize_sphinx() 时需要

基本示例代码

import speech_recognition as sr
# Shared recognizer instance used by record_and_recognize().
recognizer = sr.Recognizer()


def record_and_recognize():
    """Record one phrase from the default microphone and transcribe it.

    The phrase is sent to Google's web speech API (network access required)
    and recognised as Mandarin (``zh-CN``).

    Returns:
        The recognised text, or ``""`` when nothing could be recognised
        (no speech before the timeout, unintelligible audio, request
        failure, or any other error — each is reported on stdout).
    """
    try:
        with sr.Microphone() as source:
            print("请说话...")
            # Calibrate the energy threshold against one second of background noise.
            recognizer.adjust_for_ambient_noise(source, duration=1)
            # Wait at most 5 s for speech to start; cap a phrase at 10 s.
            audio = recognizer.listen(source, timeout=5, phrase_time_limit=10)
            print("录音完成，正在识别...")
        text = recognizer.recognize_google(audio, language='zh-CN')
    except sr.WaitTimeoutError:
        # listen() raises this when no speech starts within `timeout`;
        # it used to escape because listen() sat outside the try block.
        print("超时，未检测到语音")
    except sr.UnknownValueError:
        print("无法识别语音")
    except sr.RequestError as e:
        print(f"请求出错: {e}")
    except Exception as e:
        # Demo boundary: report anything else (e.g. no microphone) instead of crashing.
        print(f"错误: {e}")
    else:
        print(f"识别结果: {text}")
        return text
    return ""


# Run one recognition round and show what came back.
result = record_and_recognize()
print(f"最终结果: {result}")

方法二：使用百度语音识别API

安装依赖

pip install baidu-aip

百度API实现

from aip import AipSpeech
import wave
import pyaudio
class BaiduSpeechRecognition:
    """Record microphone audio with PyAudio and transcribe it via Baidu's speech API."""

    # Recording parameters. SAMPLE_RATE is also the rate reported to the API,
    # so recording and recognition cannot drift apart.
    SAMPLE_RATE = 16000
    CHANNELS = 1
    CHUNK = 1024

    def __init__(self, app_id, api_key, secret_key):
        """Create the Baidu speech client from the application's credentials."""
        self.client = AipSpeech(app_id, api_key, secret_key)

    def record_audio(self, filename="output.wav", record_seconds=5):
        """Record from the default input device and save the result as a WAV file.

        Args:
            filename: Path of the WAV file to write (16-bit mono PCM).
            record_seconds: Approximate recording length in seconds.

        Returns:
            ``filename``, for chaining into ``recognize_from_file``.
        """
        sample_format = pyaudio.paInt16
        audio = pyaudio.PyAudio()
        try:
            # Query the sample width while the PyAudio instance is still alive.
            sample_width = audio.get_sample_size(sample_format)
            stream = audio.open(format=sample_format,
                                channels=self.CHANNELS,
                                rate=self.SAMPLE_RATE,
                                input=True,
                                frames_per_buffer=self.CHUNK)
            try:
                print("开始录音...")
                chunk_count = int(self.SAMPLE_RATE / self.CHUNK * record_seconds)
                frames = [stream.read(self.CHUNK) for _ in range(chunk_count)]
                print("录音结束!")
            finally:
                # Release the input stream even if a read fails part-way.
                stream.stop_stream()
                stream.close()
        finally:
            audio.terminate()

        # The context manager closes the file even if a write fails.
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(sample_width)
            wf.setframerate(self.SAMPLE_RATE)
            wf.writeframes(b''.join(frames))
        return filename

    def recognize_from_file(self, audio_file):
        """Send a WAV file to the recognition service and return the best transcript.

        Args:
            audio_file: Path to a WAV file recorded at ``SAMPLE_RATE``.

        Returns:
            The first candidate transcript, or ``""`` when the service reports
            an error or returns no candidates (the reason is printed).
        """
        with open(audio_file, 'rb') as fp:
            audio_bytes = fp.read()
        result = self.client.asr(audio_bytes, 'wav', self.SAMPLE_RATE, {
            'dev_pid': 1537,  # Mandarin (with simple English support)
        })
        candidates = result.get('result') or []
        if result.get('err_no') == 0 and candidates:
            return candidates[0]
        # NOTE(review): the SDK appears to return dicts without 'err_no' /
        # 'err_msg' for some failures (e.g. 'error_code' / 'error_msg' on
        # timeouts) — confirm against the SDK; handled defensively here.
        reason = result.get('err_msg') or result.get('error_msg') or result
        print(f"识别失败: {reason}")
        return ""
# 使用示例
def baidu_demo():
    """Record five seconds from the microphone and print Baidu's transcription."""
    # Placeholder credentials; real ones are issued by the Baidu AI console:
    # https://console.bce.baidu.com/ai/#/ai/speech/overview/resource/getFree
    credentials = ('你的APP_ID', '你的API_KEY', '你的SECRET_KEY')
    client = BaiduSpeechRecognition(*credentials)
    # Record first, then hand the saved file straight to the recogniser.
    wav_path = client.record_audio(record_seconds=5)
    transcript = client.recognize_from_file(wav_path)
    print(f"识别结果: {transcript}")
# 运行
# baidu_demo()

方法三：使用Whisper（OpenAI开源模型，支持离线）

安装依赖

pip install openai-whisper
pip install sounddevice numpy scipy  # 如果需要录音（下方示例使用 sounddevice 录音，而非 pyaudio）

Whisper实现

import whisper
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wavfile
class WhisperRecognition:
    """Offline Mandarin speech-to-text using a locally loaded Whisper model."""

    # Sample rate the samples are converted to before transcription.
    # NOTE(review): Whisper's audio pipeline is documented as 16 kHz mono
    # float32 — confirm against the installed whisper version.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, model_size="base"):
        """Load a Whisper model.

        Args:
            model_size: One of ``tiny``, ``base``, ``small``, ``medium``, ``large``.
        """
        print(f"加载Whisper {model_size} 模型...")
        self.model = whisper.load_model(model_size)

    def record_audio(self, duration=5, sample_rate=16000):
        """Record mono audio from the default input device.

        Args:
            duration: Recording length in seconds.
            sample_rate: Capture rate in Hz.

        Returns:
            ``(recording, sample_rate)`` where ``recording`` is the float32
            array produced by ``sd.rec`` for one channel.
        """
        print("开始录音...")
        recording = sd.rec(int(duration * sample_rate),
                           samplerate=sample_rate,
                           channels=1,
                           dtype="float32")  # explicit: independent of sd.default
        sd.wait()  # block until the recording has finished
        print("录音结束!")
        return recording, sample_rate

    def recognize_from_array(self, audio_array, sample_rate=16000):
        """Transcribe raw audio samples.

        The samples are normalised before transcription: signed-integer PCM is
        scaled to [-1, 1), multi-channel input is averaged to mono, audio at
        another rate is resampled to ``TARGET_SAMPLE_RATE``, and the result is
        passed on as 1-D float32.

        Args:
            audio_array: Array-like of samples, 1-D or ``(frames, channels)``.
            sample_rate: Rate of ``audio_array`` in Hz.

        Returns:
            The recognised text; ``""`` for empty input.

        Raises:
            ValueError: If ``sample_rate`` is not positive.
        """
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")

        samples = np.asarray(audio_array)
        if samples.size == 0:
            return ""

        if np.issubdtype(samples.dtype, np.signedinteger):
            # e.g. int16 PCM from a WAV file: scale to [-1, 1).
            scale = float(abs(int(np.iinfo(samples.dtype).min)))
            samples = samples.astype(np.float32) / scale
        else:
            samples = samples.astype(np.float32, copy=False)

        if samples.ndim > 1:
            # Assumes (frames, channels) layout, as returned by sd.rec.
            samples = samples.reshape(samples.shape[0], -1).mean(axis=1)

        source_rate = int(round(sample_rate))
        if source_rate != self.TARGET_SAMPLE_RATE:
            # Function-scope imports: only this code path needs them.
            from math import gcd
            from scipy.signal import resample_poly

            divisor = gcd(source_rate, self.TARGET_SAMPLE_RATE)
            samples = resample_poly(samples,
                                    self.TARGET_SAMPLE_RATE // divisor,
                                    source_rate // divisor)

        samples = np.ascontiguousarray(samples, dtype=np.float32)
        result = self.model.transcribe(samples, language='zh')
        return result["text"]

    def recognize_from_file(self, audio_file):
        """Transcribe an audio file given by path and return the recognised text."""
        result = self.model.transcribe(audio_file, language='zh')
        return result["text"]
# 使用示例
def whisper_demo():
    """Record five seconds from the microphone and print Whisper's transcription."""
    engine = WhisperRecognition(model_size="base")

    # Option 1: record live audio and transcribe the in-memory samples.
    print("=== 实时录音识别 ===")
    samples, rate = engine.record_audio(duration=5)
    mono = samples[:, 0]  # the recording is 2-D; keep its only channel
    transcript = engine.recognize_from_array(mono, rate)
    print(f"识别结果: {transcript}")

    # Option 2 (disabled): transcribe an existing file instead, e.g.
    #   engine.recognize_from_file("test_audio.wav")
# 运行
# whisper_demo()

方法四：完整GUI应用（使用Tkinter）

import tkinter as tk
from tkinter import scrolledtext
import threading
import speech_recognition as sr
class SpeechToTextApp:
    """Tkinter window that records one phrase from the microphone and shows its transcription.

    Audio capture and the network recognition call run in a background thread.
    The worker never touches widgets: it queues callbacks that the Tk main
    loop executes, so all widget access happens on the thread running
    ``mainloop()``.
    """

    # How often (ms) the main loop polls for UI work queued by the worker.
    UI_POLL_INTERVAL_MS = 50

    def __init__(self, root):
        """Build the UI on ``root`` and start polling for worker results."""
        import queue  # function-scope import: only this class needs it

        self.root = root
        self.root.title("语音转文字")
        self.root.geometry("600x500")
        self.recognizer = sr.Recognizer()
        self.is_recording = False
        self._worker = None              # thread of the current/last session
        self._ui_queue = queue.Queue()   # callbacks posted by the worker
        self.setup_ui()
        self.root.after(self.UI_POLL_INTERVAL_MS, self._drain_ui_queue)

    def setup_ui(self):
        """Create the title, the button row, the transcript area and the status bar."""
        # Title
        title = tk.Label(self.root, text="语音转文字工具", font=("Arial", 20))
        title.pack(pady=10)
        # Button row
        btn_frame = tk.Frame(self.root)
        btn_frame.pack(pady=10)
        self.record_btn = tk.Button(btn_frame, text="开始录音",
                                    command=self.toggle_recording,
                                    bg="green", fg="white", width=15)
        self.record_btn.pack(side=tk.LEFT, padx=5)
        clear_btn = tk.Button(btn_frame, text="清除",
                              command=self.clear_text,
                              bg="orange", width=15)
        clear_btn.pack(side=tk.LEFT, padx=5)
        copy_btn = tk.Button(btn_frame, text="复制结果",
                             command=self.copy_text,
                             bg="blue", fg="white", width=15)
        copy_btn.pack(side=tk.LEFT, padx=5)
        # Transcript area
        self.text_area = scrolledtext.ScrolledText(self.root,
                                                   wrap=tk.WORD,
                                                   width=70,
                                                   height=20,
                                                   font=("Arial", 12))
        self.text_area.pack(padx=20, pady=10, fill=tk.BOTH, expand=True)
        # Status bar
        self.status_label = tk.Label(self.root, text="准备就绪",
                                     bd=1, relief=tk.SUNKEN, anchor=tk.W)
        self.status_label.pack(fill=tk.X)

    def toggle_recording(self):
        """Start a background recording session, or mark the running one as stopped."""
        if self.is_recording:
            # listen() cannot be interrupted; clearing the flag makes the UI
            # ignore whatever the worker reports afterwards.
            self.is_recording = False
            self.record_btn.config(text="开始录音", bg="green")
            self.status_label.config(text="录音已停止")
            return
        if self._worker is not None and self._worker.is_alive():
            # The previous worker may still hold the microphone.
            self.status_label.config(text="上一次录音仍在结束中，请稍候")
            return
        self.is_recording = True
        self.record_btn.config(text="停止录音", bg="red")
        self.status_label.config(text="正在录音...")
        # Record in a separate thread so the window stays responsive.
        self._worker = threading.Thread(target=self.record_and_recognize,
                                        daemon=True)
        self._worker.start()

    def record_and_recognize(self):
        """Worker-thread body: capture one phrase, transcribe it, post the outcome.

        Intended to run in the thread started by ``toggle_recording``; results
        are only applied while that session is still the active one.
        """
        try:
            with sr.Microphone() as source:
                self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
                audio = self.recognizer.listen(source, timeout=10,
                                               phrase_time_limit=5)
            self._post_status("正在识别...")
            text = self.recognizer.recognize_google(audio, language='zh-CN')
            self._post(lambda: self._show_result(text))
        except sr.WaitTimeoutError:
            self._post_status("超时，未检测到语音")
        except sr.UnknownValueError:
            self._post_status("无法识别语音")
        except sr.RequestError as e:
            self._post_status(f"网络错误: {e}")
        except Exception as e:
            # Worker boundary: report anything else (e.g. no microphone)
            # so the UI cannot get stuck in the recording state.
            self._post_status(f"错误: {str(e)}")
        finally:
            self._post(self._finish_recording)

    def _post(self, callback):
        """Queue ``callback`` for the Tk thread; skipped if this session was stopped or replaced."""
        origin = threading.current_thread()

        def guarded():
            if self.is_recording and self._worker is origin:
                callback()

        self._ui_queue.put(guarded)

    def _post_status(self, message):
        """Queue a status-bar update (``message`` is bound now, not when it runs)."""
        self._post(lambda: self.status_label.config(text=message))

    def _show_result(self, text):
        """Append a recognised phrase to the transcript (Tk thread only)."""
        self.text_area.insert(tk.END, f"我: {text}\n")
        self.text_area.see(tk.END)
        self.status_label.config(text="识别完成")

    def _finish_recording(self):
        """Return the UI to the idle state after a session ends (Tk thread only)."""
        self.is_recording = False
        self.record_btn.config(text="开始录音", bg="green")

    def _drain_ui_queue(self):
        """Run every queued callback on the Tk thread, then re-arm the poll."""
        try:
            # This method is the only consumer, so empty() followed by
            # get_nowait() cannot race with another reader.
            while not self._ui_queue.empty():
                self._ui_queue.get_nowait()()
        finally:
            self.root.after(self.UI_POLL_INTERVAL_MS, self._drain_ui_queue)

    def clear_text(self):
        """Erase the transcript area."""
        self.text_area.delete("1.0", tk.END)
        self.status_label.config(text="内容已清除")

    def copy_text(self):
        """Copy the transcript to the clipboard."""
        # "end-1c" drops the trailing newline Tk appends to Text content.
        content = self.text_area.get("1.0", "end-1c")
        self.root.clipboard_clear()
        self.root.clipboard_append(content)
        self.status_label.config(text="已复制到剪贴板")
# 运行GUI应用
def run_gui():
    """Create the Tk root window, attach the speech-to-text UI and run the event loop."""
    window = tk.Tk()
    # Keep a reference to the app object for as long as the event loop runs.
    _app = SpeechToTextApp(window)
    window.mainloop()
# 取消注释运行
# run_gui()

方法五：语音转文字的进阶应用

实时语音识别 + 关键词提取（需先安装分词库：pip install jieba）

import speech_recognition as sr
import jieba
from collections import Counter
class AdvancedSTT:
    """Continuous microphone transcription with per-phrase keywords and word statistics."""

    def __init__(self):
        """Create the recogniser and an empty transcript history."""
        self.recognizer = sr.Recognizer()
        self.text_history = []  # recognised phrases, in order

    @staticmethod
    def _keywords(text):
        """Return the jieba tokens of ``text`` that are longer than one character."""
        return [word for word in jieba.lcut(text) if len(word) > 1]

    def real_time_recognition(self):
        """Listen and transcribe in a loop until Ctrl+C, then print statistics.

        A silent timeout, an unintelligible phrase or a failed request only
        skips that round; the loop keeps listening.
        """
        try:
            with sr.Microphone() as source:
                print("开始监听，按Ctrl+C退出...")
                self.recognizer.adjust_for_ambient_noise(source, duration=1)
                while True:
                    print("请说话...")
                    try:
                        audio = self.recognizer.listen(source, timeout=5)
                        text = self.recognizer.recognize_google(
                            audio, language='zh-CN')
                    except sr.WaitTimeoutError:
                        # Nobody spoke within the timeout: keep listening.
                        continue
                    except sr.UnknownValueError:
                        print("无法识别语音")
                        continue
                    except sr.RequestError as e:
                        print(f"请求出错: {e}")
                        continue
                    print(f"识别: {text}")
                    self.text_history.append(text)
                    # Keyword extraction
                    important_words = self._keywords(text)
                    if important_words:
                        print(f"关键词: {important_words}")
        except KeyboardInterrupt:
            print("\n结束录音")
            self.show_statistics()
        except Exception as e:
            # Demo boundary: report unexpected failures (e.g. no microphone).
            print(f"错误: {e}")

    def show_statistics(self):
        """Print the number of recognised phrases and the five most frequent keywords."""
        print("\n=== 统计信息 ===")
        print(f"总句子数: {len(self.text_history)}")
        # Count the same tokens the live keyword output shows, so single
        # characters and punctuation do not dominate the ranking.
        word_freq = Counter()
        for text in self.text_history:
            word_freq.update(self._keywords(text))
        print("高频词:")
        for word, count in word_freq.most_common(5):
            print(f"  {word}: {count}次")
# 使用示例
def advanced_demo():
    """Run the continuous listen-and-transcribe loop of ``AdvancedSTT``."""
    AdvancedSTT().real_time_recognition()
# 取消注释运行
# advanced_demo()

使用注意事项

网络要求：在线API需要稳定的网络连接
音频质量：保持麦克风清晰，减少环境噪音
模型选择：Whisper的large模型效果最好但需要更多资源
API密钥：使用百度等API需要申请并妥善保管密钥
隐私保护：在线服务会传输音频数据，敏感内容建议使用离线方案

最佳实践建议

# 根据需求选择合适的方案
def get_stt_solution(use_case):
    """Return the recommended speech-to-text stack for a usage scenario.

    Args:
        use_case: One of ``quick_demo``, ``production``, ``offline``,
            ``realtime`` or ``batch``.

    Returns:
        The recommendation for ``use_case``, or a prompt listing the valid
        choices when ``use_case`` is not one of them.
    """
    fallback = "请选择: quick_demo, production, offline, realtime, batch"
    recommendations = dict(
        quick_demo="SpeechRecognition + Google API",
        production="百度/阿里云 API (稳定可靠)",
        offline="Whisper (支持离线，准确度高)",
        realtime="Vosk (轻量级，适合实时应用)",
        batch="讯飞API (适合大批量处理)",
    )
    try:
        return recommendations[use_case]
    except KeyError:
        return fallback
print(get_stt_solution("offline"))  # prints: Whisper (支持离线，准确度高)

您可以根据实际需求选择合适的方法，如果您需要特定平台的实现或有其他要求，请告诉我！

标签： Python编程