Skip to content

在 Web 项目中集成文本转语音功能

本文将通过一个完整的实战案例,演示如何在 Web 项目中集成文本转语音(TTS)功能。我们将涵盖从基础实现到高级优化的完整流程。

项目需求分析

功能需求

  • ✅ 文本输入和语音播放
  • ✅ 多种语音选择
  • ✅ 语速、音调调节
  • ✅ 播放控制(暂停、继续、停止)
  • ✅ 进度显示
  • ✅ 音频下载

技术选型

| 方案 | 优点 | 缺点 | 适用场景 |
| --- | --- | --- | --- |
| Web Speech API | 免费、简单 | 功能有限 | 基础需求 |
| 云服务 API | 功能强大 | 需付费 | 商业应用 |
| 自建服务 | 完全控制 | 维护成本高 | 定制需求 |

我们采用混合方案:基础功能使用 Web Speech API,高级功能集成云服务。

项目初始化

创建项目

bash
# 使用 Vite 创建项目
npm create vite@latest tts-demo -- --template vue
cd tts-demo
npm install

# 安装依赖
npm install axios element-plus

项目结构

tts-demo/
├── src/
│   ├── components/
│   │   ├── TTSPlayer.vue       # 主播放器组件
│   │   ├── VoiceSelector.vue   # 语音选择器
│   │   └── Controls.vue        # 控制面板
│   ├── services/
│   │   ├── ttsService.js       # TTS 服务封装
│   │   └── cloudTTSService.js  # 云服务封装
│   ├── utils/
│   │   └── audioUtils.js       # 音频工具函数
│   ├── App.vue
│   └── main.js
├── public/
└── package.json

核心服务实现

1. TTS 基础服务

javascript
// src/services/ttsService.js

/**
 * Wrapper around the browser Web Speech API (`window.speechSynthesis`).
 *
 * Tracks playback state (`isPlaying` / `isPaused`), keeps the most recently
 * loaded voice list, and forwards utterance events to the callbacks
 * registered through `on()`.
 */
class TTSService {
  constructor() {
    this.synth = window.speechSynthesis;
    this.utterance = null;
    this.voices = [];
    this.isPlaying = false;
    this.isPaused = false;
    this.callbacks = {
      onStart: null,
      onEnd: null,
      onError: null,
      onBoundary: null
    };

    // Start loading voices right away; callers may also await initVoices().
    this.initVoices();
  }

  /**
   * Load the voice list into `this.voices`.
   *
   * @returns {Object|Promise<Object>} Voices grouped by primary language
   *   subtag (e.g. `zh`, `en`). Returned synchronously when voices are already
   *   available, otherwise as a promise that settles on the next
   *   `voiceschanged` event.
   */
  initVoices() {
    const loadVoices = () => {
      this.voices = this.synth.getVoices();
      return this.voices.reduce((acc, voice) => {
        const lang = voice.lang.split('-')[0];
        if (!acc[lang]) acc[lang] = [];
        acc[lang].push(voice);
        return acc;
      }, {});
    };

    // Some browsers only populate the list after `voiceschanged` fires.
    if (this.synth.getVoices().length > 0) {
      return loadVoices();
    }

    return new Promise(resolve => {
      this.synth.onvoiceschanged = () => resolve(loadVoices());
    });
  }

  /** @returns {Array} The voices captured by the last `initVoices()` load. */
  getVoices() {
    return this.voices;
  }

  /** @returns {Array} Voices whose `lang` starts with `zh` or contains `CN`. */
  getChineseVoices() {
    return this.voices.filter(voice =>
      voice.lang.startsWith('zh') || voice.lang.includes('CN')
    );
  }

  /**
   * Speak `text`, replacing whatever is currently being spoken.
   *
   * @param {string} text - Text to synthesize.
   * @param {Object} [options]
   * @param {Object} [options.voice] - Voice object taken from `getVoices()`.
   * @param {number} [options.rate=1.0] - Speaking rate.
   * @param {number} [options.pitch=1.0] - Pitch; 0 is honoured.
   * @param {number} [options.volume=1.0] - Volume; 0 (muted) is honoured.
   * @param {string} [options.lang='zh-CN'] - Language tag.
   * @returns {Promise<void>} Resolves when the utterance ends or is cancelled;
   *   rejects with the error event on a genuine synthesis error.
   */
  speak(text, options = {}) {
    return new Promise((resolve, reject) => {
      // Cancel anything still queued or playing.
      this.stop();

      const utterance = new SpeechSynthesisUtterance(text);
      this.utterance = utterance;

      if (options.voice) {
        utterance.voice = options.voice;
      }
      // A rate of 0 is not meaningful, so `||` keeps falling back to 1.0.
      utterance.rate = options.rate || 1.0;
      // 0 is a legitimate pitch/volume, so only null/undefined fall back.
      utterance.pitch = options.pitch ?? 1.0;
      utterance.volume = options.volume ?? 1.0;
      utterance.lang = options.lang || 'zh-CN';

      // Events of an utterance that has been stopped or replaced must not
      // overwrite the state that belongs to its successor.
      const isCurrent = () => this.utterance === utterance;

      utterance.onstart = () => {
        if (!isCurrent()) return;
        this.isPlaying = true;
        this.isPaused = false;
        if (this.callbacks.onStart) this.callbacks.onStart();
      };

      utterance.onend = () => {
        if (isCurrent()) {
          this.isPlaying = false;
          this.isPaused = false;
          if (this.callbacks.onEnd) this.callbacks.onEnd();
        }
        resolve();
      };

      utterance.onerror = (event) => {
        if (isCurrent()) {
          this.isPlaying = false;
          this.isPaused = false;
        }
        // cancel() surfaces as an "interrupted"/"canceled" error event; that
        // is a deliberate stop, not a failure worth reporting.
        if (event.error === 'interrupted' || event.error === 'canceled') {
          resolve();
          return;
        }
        if (this.callbacks.onError) this.callbacks.onError(event);
        reject(event);
      };

      utterance.onboundary = (event) => {
        if (isCurrent() && this.callbacks.onBoundary) {
          this.callbacks.onBoundary(event);
        }
      };

      this.synth.speak(utterance);
    });
  }

  /** Pause playback if something is playing and not already paused. */
  pause() {
    if (this.isPlaying && !this.isPaused) {
      this.synth.pause();
      this.isPaused = true;
    }
  }

  /** Resume playback if it is currently paused. */
  resume() {
    if (this.isPlaying && this.isPaused) {
      this.synth.resume();
      this.isPaused = false;
    }
  }

  /** Cancel all queued speech and reset the playback state. */
  stop() {
    this.synth.cancel();
    // Detach the utterance so its late events are treated as stale.
    this.utterance = null;
    this.isPlaying = false;
    this.isPaused = false;
  }

  /**
   * Register the callback for an event, replacing any previous one.
   *
   * @param {'start'|'end'|'error'|'boundary'} event - Event name; unknown
   *   names are ignored.
   * @param {Function} callback - Handler to invoke.
   */
  on(event, callback) {
    const eventMap = {
      'start': 'onStart',
      'end': 'onEnd',
      'error': 'onError',
      'boundary': 'onBoundary'
    };

    if (eventMap[event]) {
      this.callbacks[eventMap[event]] = callback;
    }
  }

  /** Stop playback and clear every registered callback. */
  destroy() {
    this.stop();
    // Keep the same keys as the constructor so the object shape stays stable.
    this.callbacks = {
      onStart: null,
      onEnd: null,
      onError: null,
      onBoundary: null
    };
  }
}

// 导出单例
export default new TTSService();

2. 云服务集成

javascript
// src/services/cloudTTSService.js

import axios from 'axios';

// Escape a value for safe use inside SSML text or attribute values.
function escapeXml(value) {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&apos;'
  };
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}

// Decode a base64 string into raw bytes.
function decodeBase64(base64) {
  const binary = atob(base64);
  const bytes = new Uint8Array(binary.length);
  for (let i = 0; i < binary.length; i += 1) {
    bytes[i] = binary.charCodeAt(i);
  }
  return bytes;
}

/**
 * Client for cloud text-to-speech providers: Google Cloud TTS, Azure TTS, or
 * a custom backend endpoint.
 *
 * NOTE(review): `apiKey` is sent straight from wherever this class runs; in a
 * browser that exposes the key to end users. Consider the `custom` provider
 * with a server-side proxy for production use.
 */
class CloudTTSService {
  /**
   * @param {Object} [config]
   * @param {string} [config.provider='google'] - 'google', 'azure' or 'custom'.
   * @param {string} [config.apiKey] - Provider API key.
   * @param {string} [config.baseUrl='/api/tts'] - Endpoint of the custom backend.
   */
  constructor(config = {}) {
    this.provider = config.provider || 'google';
    this.apiKey = config.apiKey;
    this.baseUrl = config.baseUrl || '/api/tts';
  }

  /**
   * Switch provider and merge extra settings onto the instance.
   *
   * @param {string} provider - Provider name.
   * @param {Object} [config] - Properties copied onto this instance.
   */
  setProvider(provider, config = {}) {
    this.provider = provider;
    Object.assign(this, config);
  }

  /**
   * Synthesize with Google Cloud TTS.
   *
   * @param {string} text - Text to synthesize.
   * @param {Object} [options] - `lang`, `voiceName`, `rate`, `pitch`.
   * @returns {Promise<string>} Base64-encoded MP3 audio (`audioContent`).
   */
  async googleTTS(text, options = {}) {
    const response = await axios.post(
      `https://texttospeech.googleapis.com/v1/text:synthesize?key=${this.apiKey}`,
      {
        input: { text },
        voice: {
          languageCode: options.lang || 'zh-CN',
          name: options.voiceName || 'zh-CN-Wavenet-A'
        },
        audioConfig: {
          audioEncoding: 'MP3',
          speakingRate: options.rate || 1.0,
          pitch: options.pitch || 0
        }
      }
    );

    return response.data.audioContent;
  }

  /**
   * Synthesize with Azure TTS using an SSML request body.
   *
   * @param {string} text - Text to synthesize; escaped before it is embedded.
   * @param {Object} [options] - `lang`, `voiceName`, `rate`, `pitch`, `region`.
   * @returns {Promise<ArrayBuffer>} Response body requested as an array buffer.
   */
  async azureTTS(text, options = {}) {
    // Everything interpolated into the SSML is escaped so characters such as
    // `&` or `<` cannot break the document or inject markup.
    const lang = escapeXml(options.lang || 'zh-CN');
    const voiceName = escapeXml(options.voiceName || 'zh-CN-XiaoxiaoNeural');
    const rate = escapeXml(options.rate || '1.0');
    const pitch = escapeXml(options.pitch || '0%');

    const ssml = `
      <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${lang}">
        <voice name="${voiceName}">
          <prosody rate="${rate}" pitch="${pitch}">
            ${escapeXml(text)}
          </prosody>
        </voice>
      </speak>
    `;

    const response = await axios.post(
      `https://${options.region || 'eastasia'}.tts.speech.microsoft.com/cognitiveservices/v1`,
      ssml,
      {
        headers: {
          'Ocp-Apim-Subscription-Key': this.apiKey,
          'Content-Type': 'application/ssml+xml',
          'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3'
        },
        responseType: 'arraybuffer'
      }
    );

    return response.data;
  }

  /**
   * Synthesize through the custom backend at `baseUrl`.
   *
   * @param {string} text - Text to synthesize.
   * @param {Object} [options] - `lang`, `voice`, `rate`, `pitch`.
   * @returns {Promise<ArrayBuffer>} Response body requested as an array buffer.
   */
  async customBackend(text, options = {}) {
    const response = await axios.post(this.baseUrl, {
      text,
      lang: options.lang || 'zh-CN',
      voice: options.voice,
      rate: options.rate || 1.0,
      pitch: options.pitch || 1.0
    }, {
      responseType: 'arraybuffer'
    });

    return response.data;
  }

  /**
   * Dispatch to the configured provider.
   *
   * @param {string} text - Text to synthesize.
   * @param {Object} [options] - Provider-specific options.
   * @returns {Promise<string|ArrayBuffer>} Base64 string for 'google', binary
   *   data for the other providers.
   * @throws {Error} When the provider name is not recognised.
   */
  async synthesize(text, options = {}) {
    switch (this.provider) {
      case 'google':
        return this.googleTTS(text, options);
      case 'azure':
        return this.azureTTS(text, options);
      case 'custom':
        return this.customBackend(text, options);
      default:
        throw new Error(`Unknown provider: ${this.provider}`);
    }
  }

  /**
   * Synthesize `text` and trigger a browser download of the audio.
   *
   * @param {string} text - Text to synthesize.
   * @param {string} [filename='speech.mp3'] - Suggested download file name.
   * @param {Object} [options] - Passed through to `synthesize()`.
   * @returns {Promise<void>}
   */
  async downloadAudio(text, filename = 'speech.mp3', options = {}) {
    const audioData = await this.synthesize(text, options);

    // The Google provider returns base64 text; writing that string into the
    // Blob as-is would produce an unplayable file, so decode it to bytes.
    const payload = typeof audioData === 'string'
      ? decodeBase64(audioData)
      : audioData;

    const blob = new Blob([payload], { type: 'audio/mpeg' });
    const url = URL.createObjectURL(blob);

    const link = document.createElement('a');
    link.href = url;
    link.download = filename;
    link.click();

    // Release the object URL.
    URL.revokeObjectURL(url);
  }
}

export default CloudTTSService;

3. 音频工具函数

javascript
// src/utils/audioUtils.js

// Base64 转 Blob
/**
 * Convert a base64 string into a Blob.
 *
 * @param {string} base64 - Base64-encoded payload.
 * @param {string} [mimeType='audio/mpeg'] - MIME type assigned to the Blob.
 * @returns {Blob} Blob holding the decoded bytes.
 */
export function base64ToBlob(base64, mimeType = 'audio/mpeg') {
  const CHUNK_SIZE = 512;
  const binary = atob(base64);
  const parts = [];

  // Decode in fixed-size pieces so no single typed array spans the payload.
  let start = 0;
  while (start < binary.length) {
    const piece = binary.substring(start, start + CHUNK_SIZE);
    parts.push(Uint8Array.from(piece, (ch) => ch.charCodeAt(0)));
    start += CHUNK_SIZE;
  }

  return new Blob(parts, { type: mimeType });
}

// 音频时长计算
/**
 * Estimate how long speaking `text` takes.
 *
 * @param {string} text - Text to be spoken.
 * @param {number} [rate=150] - Assumed speed in characters per minute.
 * @returns {number} Estimated duration in whole seconds, rounded up.
 */
export function estimateDuration(text, rate = 150) {
  const SECONDS_PER_MINUTE = 60;
  return Math.ceil((text.length / rate) * SECONDS_PER_MINUTE);
}

// 音频可视化
/**
 * Draw a live waveform of `audioSource` onto `canvas`.
 *
 * An analyser node is inserted between the source and
 * `audioContext.destination`, and the time-domain data is redrawn on every
 * animation frame until the returned function is called.
 *
 * @param {AudioContext} audioContext - Context that owns the nodes.
 * @param {HTMLCanvasElement} canvas - Canvas to draw on.
 * @param {AudioNode} audioSource - Source node to visualise.
 * @returns {Function} `stop()` — cancels the animation loop and disconnects
 *   the analyser. Calling it more than once is harmless.
 */
export function visualizeAudio(audioContext, canvas, audioSource) {
  const analyser = audioContext.createAnalyser();
  audioSource.connect(analyser);
  analyser.connect(audioContext.destination);

  const canvasCtx = canvas.getContext('2d');
  const bufferLength = analyser.frequencyBinCount;
  const dataArray = new Uint8Array(bufferLength);

  let frameId = null;
  let stopped = false;

  function draw() {
    if (stopped) return;
    frameId = requestAnimationFrame(draw);

    analyser.getByteTimeDomainData(dataArray);

    canvasCtx.fillStyle = 'rgb(200, 200, 200)';
    canvasCtx.fillRect(0, 0, canvas.width, canvas.height);

    canvasCtx.lineWidth = 2;
    canvasCtx.strokeStyle = 'rgb(0, 0, 0)';
    canvasCtx.beginPath();

    const sliceWidth = canvas.width / bufferLength;
    let x = 0;

    for (let i = 0; i < bufferLength; i++) {
      // Samples are unsigned bytes centred on 128, so this maps 128 to the
      // vertical middle of the canvas.
      const v = dataArray[i] / 128.0;
      const y = v * canvas.height / 2;

      if (i === 0) {
        canvasCtx.moveTo(x, y);
      } else {
        canvasCtx.lineTo(x, y);
      }

      x += sliceWidth;
    }

    canvasCtx.lineTo(canvas.width, canvas.height / 2);
    canvasCtx.stroke();
  }

  draw();

  // Without a way to stop, the loop and the audio graph would outlive the
  // component that started them.
  return function stop() {
    if (stopped) return;
    stopped = true;
    if (frameId !== null) cancelAnimationFrame(frameId);
    audioSource.disconnect(analyser);
    analyser.disconnect();
  };
}

// 文本分段(长文本处理)
/**
 * Split long text into chunks of at most `maxLength` characters, breaking at
 * sentence boundaries where possible.
 *
 * Sentence-ending punctuation and newlines stay attached to the sentence they
 * close, so joining the returned chunks reproduces `text` exactly.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxLength=200] - Maximum chunk length (UTF-16 code units).
 * @returns {string[]} Chunks in original order; empty for empty input.
 * @throws {RangeError} When `maxLength` is less than 1.
 */
export function splitTextIntoChunks(text, maxLength = 200) {
  const limit = Math.floor(maxLength);
  // A non-positive limit can never make progress when slicing.
  if (!(limit >= 1)) {
    throw new RangeError('maxLength must be at least 1');
  }
  if (!text) return [];

  // A sentence is a run of non-delimiters plus the delimiters that follow it
  // (ASCII and full-width forms); a leading run of delimiters also counts.
  const sentences =
    text.match(/[^。!?;!?;\n]+[。!?;!?;\n]*|[。!?;!?;\n]+/g) ?? [];

  const chunks = [];
  let currentChunk = '';

  for (const sentence of sentences) {
    if (currentChunk.length + sentence.length <= limit) {
      currentChunk += sentence;
      continue;
    }

    if (currentChunk) {
      chunks.push(currentChunk);
      currentChunk = '';
    }

    if (sentence.length <= limit) {
      currentChunk = sentence;
      continue;
    }

    // A single sentence longer than the limit is cut at fixed offsets.
    for (let i = 0; i < sentence.length; i += limit) {
      chunks.push(sentence.slice(i, i + limit));
    }
  }

  if (currentChunk) {
    chunks.push(currentChunk);
  }

  return chunks;
}

Vue 组件实现

1. 主播放器组件

vue
<!-- src/components/TTSPlayer.vue -->
<template>
  <div class="tts-player">
    <div class="input-section">
      <el-input
        v-model="text"
        type="textarea"
        :rows="6"
        placeholder="请输入要转换的文本..."
        maxlength="5000"
        show-word-limit
      />
    </div>

    <div class="controls-section">
      <VoiceSelector
        v-model="selectedVoice"
        :voices="voices"
      />
      
      <div class="slider-group">
        <div class="slider-item">
          <span>语速: {{ rate.toFixed(1) }}</span>
          <el-slider
            v-model="rate"
            :min="0.5"
            :max="2"
            :step="0.1"
          />
        </div>
        
        <div class="slider-item">
          <span>音调: {{ pitch.toFixed(1) }}</span>
          <el-slider
            v-model="pitch"
            :min="0.5"
            :max="2"
            :step="0.1"
          />
        </div>
        
        <div class="slider-item">
          <span>音量: {{ volume.toFixed(1) }}</span>
          <el-slider
            v-model="volume"
            :min="0"
            :max="1"
            :step="0.1"
          />
        </div>
      </div>
    </div>

    <div class="player-controls">
      <el-button
        type="primary"
        @click="play"
        :disabled="!text || isLoading"
        :loading="isLoading"
      >
        {{ isPlaying ? '播放中...' : '播放' }}
      </el-button>
      
      <el-button
        @click="togglePause"
        :disabled="!isPlaying"
      >
        {{ isPaused ? '继续' : '暂停' }}
      </el-button>
      
      <el-button
        @click="stop"
        :disabled="!isPlaying && !isPaused"
      >
        停止
      </el-button>
      
      <el-button
        @click="download"
        :disabled="!text"
      >
        下载音频
      </el-button>
    </div>

    <div class="progress-section" v-if="isPlaying || isPaused">
      <el-progress
        :percentage="progress"
        :status="isPlaying ? '' : 'warning'"
      />
      <p class="status-text">
        {{ statusText }}
      </p>
    </div>
  </div>
</template>

<script setup>
import { ref, computed, onMounted, onUnmounted } from 'vue';
import { ElMessage } from 'element-plus';
import ttsService from '../services/ttsService';
import cloudTTSService from '../services/cloudTTSService';
import VoiceSelector from './VoiceSelector.vue';

// 状态
// Text typed by the user (bound to the textarea).
const text = ref('');
// Voice list shown in the selector, and the currently chosen voice.
const voices = ref([]);
const selectedVoice = ref(null);
// Slider-bound synthesis parameters.
const rate = ref(1.0);
const pitch = ref(1.0);
const volume = ref(1.0);
// Playback flags driving button states and the progress section.
const isPlaying = ref(false);
const isPaused = ref(false);
const isLoading = ref(false);
// Progress percentage and last reported character offset.
const progress = ref(0);
const charIndex = ref(0);

// 计算属性
// Status line under the progress bar: paused wins over playing; otherwise empty.
const statusText = computed(() =>
  isPaused.value
    ? '已暂停'
    : isPlaying.value
      ? `正在播放... (${charIndex.value}/${text.value.length})`
      : ''
);

// 初始化
onMounted(async () => {
  // Register callbacks before awaiting the voice list: initVoices() may
  // return a promise that settles only when the browser fires
  // `voiceschanged`, and nothing after the await runs until then.
  ttsService.on('boundary', (event) => {
    charIndex.value = event.charIndex;
    const total = text.value.length;
    // Guard against an emptied textarea to avoid dividing by zero.
    progress.value = total > 0
      ? Math.min(100, Math.round((event.charIndex / total) * 100))
      : 0;
  });

  ttsService.on('end', () => {
    isPlaying.value = false;
    isPaused.value = false;
    progress.value = 100;
  });

  ttsService.on('error', (event) => {
    isPlaying.value = false;
    isPaused.value = false;
    // Stopping or replacing speech is reported as an "interrupted" or
    // "canceled" error event; that is not a failure to show the user.
    if (event.error === 'interrupted' || event.error === 'canceled') return;
    ElMessage.error(`播放出错: ${event.error}`);
  });

  await ttsService.initVoices();
  voices.value = ttsService.getVoices();

  // Preselect the first Chinese voice when one exists.
  const [firstChineseVoice] = ttsService.getChineseVoices();
  if (firstChineseVoice) {
    selectedVoice.value = firstChineseVoice;
  }
});

// 播放
// Counts play() invocations so a late-settling earlier call cannot reset the
// state that belongs to a newer one.
let playSession = 0;

/**
 * Speak the current text with the selected voice and slider settings.
 * Does nothing when the text is blank. A failure toast is shown only for
 * genuine errors, not when playback was stopped or replaced.
 *
 * @returns {Promise<void>} Settles once playback has finished or failed.
 */
async function play() {
  if (!text.value.trim()) return;

  const session = ++playSession;
  isLoading.value = true;
  progress.value = 0;
  charIndex.value = 0;

  try {
    isPlaying.value = true;
    await ttsService.speak(text.value, {
      voice: selectedVoice.value,
      rate: rate.value,
      pitch: pitch.value,
      volume: volume.value
    });
  } catch (error) {
    // Cancelling speech rejects with an "interrupted"/"canceled" event.
    const reason = error?.error;
    if (reason !== 'interrupted' && reason !== 'canceled') {
      ElMessage.error('播放失败');
      console.error(error);
    }
  } finally {
    if (session === playSession) {
      isLoading.value = false;
      // Reset here as well: if speak() fails before any utterance event
      // fires, no callback would clear these flags.
      isPlaying.value = false;
      isPaused.value = false;
    }
  }
}

// 暂停/继续
/** Toggle between paused and playing, keeping the local flag in sync. */
function togglePause() {
  const wasPaused = isPaused.value;
  if (wasPaused) {
    ttsService.resume();
  } else {
    ttsService.pause();
  }
  isPaused.value = !wasPaused;
}

// 停止
/** Stop playback and return every playback indicator to its idle value. */
function stop() {
  ttsService.stop();
  for (const flag of [isPlaying, isPaused]) {
    flag.value = false;
  }
  for (const counter of [progress, charIndex]) {
    counter.value = 0;
  }
}

// 下载音频
/**
 * Synthesize the current text through the cloud service and download it as
 * an MP3. Does nothing when the text is blank; shows a toast on success or
 * failure.
 */
async function download() {
  const content = text.value;
  if (!content.trim()) return;

  isLoading.value = true;
  try {
    // Uses the custom backend; 'google' or 'azure' are the alternatives.
    const cloudConfig = { provider: 'custom', baseUrl: '/api/tts' };
    const service = new cloudTTSService(cloudConfig);

    const filename = `speech_${Date.now()}.mp3`;
    const synthesisOptions = {
      voice: selectedVoice.value?.name,
      rate: rate.value,
      pitch: pitch.value
    };
    await service.downloadAudio(content, filename, synthesisOptions);

    ElMessage.success('音频已下载');
  } catch (error) {
    ElMessage.error('下载失败,请检查服务配置');
    console.error(error);
  } finally {
    isLoading.value = false;
  }
}

// 清理
// Stop speech and drop the registered callbacks when the component goes away.
onUnmounted(function releaseTTS() {
  ttsService.destroy();
});
</script>

<style scoped>
.tts-player {
  max-width: 800px;
  margin: 0 auto;
  padding: 20px;
}

.input-section {
  margin-bottom: 20px;
}

.controls-section {
  margin-bottom: 20px;
}

.slider-group {
  margin-top: 15px;
}

.slider-item {
  margin-bottom: 15px;
}

.slider-item span {
  display: block;
  margin-bottom: 5px;
  font-size: 14px;
  color: #666;
}

.player-controls {
  margin-bottom: 20px;
}

.progress-section {
  padding: 15px;
  background: #f5f7fa;
  border-radius: 8px;
}

.status-text {
  margin-top: 10px;
  font-size: 14px;
  color: #666;
  text-align: center;
}
</style>

2. 语音选择器组件

vue
<!-- src/components/VoiceSelector.vue -->
<template>
  <div class="voice-selector">
    <el-select
      v-model="localValue"
      placeholder="选择语音"
      filterable
      @change="handleChange"
    >
      <el-option-group
        v-for="(voices, lang) in groupedVoices"
        :key="lang"
        :label="getLanguageName(lang)"
      >
        <el-option
          v-for="voice in voices"
          :key="voice.voiceURI"
          :label="voice.name"
          :value="voice.voiceURI"
        >
          <div class="voice-option">
            <span>{{ voice.name }}</span>
            <el-tag v-if="voice.default" size="small" type="success">
              默认
            </el-tag>
          </div>
        </el-option>
      </el-option-group>
    </el-select>
    
    <el-button
      @click="playSample"
      :disabled="!localValue"
      size="small"
      style="margin-left: 10px;"
    >
      试听
    </el-button>
  </div>
</template>

<script setup>
import { ref, computed, watch } from 'vue';
import { ElMessage } from 'element-plus';

// Component inputs: `modelValue` is the selected voice object (v-model) and
// `voices` is the list of voices to choose from.
const props = defineProps({
  modelValue: {
    type: Object,
    default: null
  },
  voices: {
    type: Array,
    default: () => []
  }
});

// Emitted with the newly selected voice object to update v-model.
const emit = defineEmits(['update:modelValue']);

// The select works with voiceURI strings, so the bound value is the URI of
// the incoming voice, or '' when no voice is selected.
const localValue = ref(props.modelValue?.voiceURI || '');

// 按语言分组
// Voices bucketed by primary language subtag (the part before the first '-').
const groupedVoices = computed(() => {
  const byLanguage = {};
  for (const voice of props.voices) {
    const [lang] = voice.lang.split('-');
    if (!byLanguage[lang]) {
      byLanguage[lang] = [];
    }
    byLanguage[lang].push(voice);
  }
  return byLanguage;
});

// 语言名称映射
// Display names for the language codes the selector knows about.
const languageNames = {
  zh: '中文',
  en: '英语',
  ja: '日语',
  ko: '韩语',
  fr: '法语',
  de: '德语',
  es: '西班牙语'
};

/**
 * Resolve a language code to its display name.
 *
 * @param {string} langCode - Primary language subtag, e.g. 'zh'.
 * @returns {string} The mapped name, or the upper-cased code when unmapped.
 */
function getLanguageName(langCode) {
  const knownName = languageNames[langCode];
  return knownName ? knownName : langCode.toUpperCase();
}

// 处理选择变化
/**
 * Emit the voice object matching the selected URI.
 *
 * @param {string} voiceURI - Value chosen in the select.
 */
function handleChange(voiceURI) {
  const matchesSelection = (candidate) => candidate.voiceURI === voiceURI;
  emit('update:modelValue', props.voices.find(matchesSelection));
}

// 试听
/** Speak a short bilingual sample with the currently selected voice. */
function playSample() {
  const voice = props.voices.find(
    ({ voiceURI }) => voiceURI === localValue.value
  );

  if (voice) {
    const sample = new SpeechSynthesisUtterance(
      '你好,这是语音试听示例。Hello, this is a voice sample.'
    );
    sample.voice = voice;
    window.speechSynthesis.speak(sample);
  }
}

// 监听外部变化
// Mirror v-model changes from the parent into the select's bound value.
watch(
  () => props.modelValue,
  (nextVoice) => {
    localValue.value = nextVoice?.voiceURI || '';
  }
);
</script>

<style scoped>
.voice-selector {
  display: flex;
  align-items: center;
}

.voice-option {
  display: flex;
  justify-content: space-between;
  align-items: center;
}
</style>

后端服务示例

Node.js 后端(Express)

javascript
// server.js
const express = require('express');
const multer = require('multer');
const path = require('path');
const fs = require('fs');

// Express application and the port it listens on.
const app = express();
const port = 3000;

// Middleware, registered ahead of the routes below: the /api/tts handler
// reads `req.body`, and static files are served from `public`.
app.use(express.json());
app.use(express.static('public'));

// 模拟 TTS 服务(实际项目中集成真实 API)
// POST /api/tts — placeholder synthesis endpoint; plug a real TTS API in here.
// Request body: { text, voice, rate, pitch }. Responds with audio/mpeg bytes,
// 400 for an invalid `text`, or 500 when generation fails.
app.post('/api/tts', async (req, res) => {
  // `req.body` may be undefined when the request carries no JSON payload.
  const { text, voice, rate, pitch } = req.body ?? {};

  // The body is untrusted input: reject anything that is not non-blank text.
  if (typeof text !== 'string' || text.trim() === '') {
    return res.status(400).json({ error: 'text 不能为空' });
  }
  // Mirrors the 5000-character limit of the client-side textarea.
  if (text.length > 5000) {
    return res.status(400).json({ error: 'text 长度不能超过 5000' });
  }

  try {
    // Integrate the real TTS API here (Google Cloud TTS, Azure, Baidu, ...),
    // for example:
    // const audioData = await azureTTS(text, { voice, rate, pitch });

    // Temporary placeholder data.
    const audioBuffer = await generateAudio(text);

    res.setHeader('Content-Type', 'audio/mpeg');
    res.send(audioBuffer);
  } catch (error) {
    console.error('TTS Error:', error);
    res.status(500).json({ error: '音频生成失败' });
  }
});

// 音频文件上传
// POST /api/upload-audio — stores one uploaded file (form field "audio")
// under uploads/ and returns its generated name.
const upload = multer({ dest: 'uploads/' });
app.post('/api/upload-audio', upload.single('audio'), (req, res) => {
  // `req.file` is undefined when the request contains no "audio" file part.
  if (!req.file) {
    return res.status(400).json({ error: '未收到音频文件' });
  }

  // NOTE(review): no route or static mount for /audio is visible in this
  // file — confirm the returned path is actually served.
  res.json({
    filename: req.file.filename,
    path: `/audio/${req.file.filename}`
  });
});

// Start the HTTP server and log its address once it is listening.
app.listen(port, function onListening() {
  console.log(`TTS Server running at http://localhost:${port}`);
});

// 辅助函数:生成音频(示例)
/**
 * Placeholder audio generator; a real implementation would call a TTS API.
 *
 * @param {string} text - Text to synthesize (unused by the placeholder).
 * @returns {Promise<Buffer>} An empty buffer.
 */
async function generateAudio(text) {
  const emptyAudio = Buffer.alloc(0);
  return emptyAudio;
}

高级功能扩展

1. 长文本处理

javascript
// 分段播放长文本
/**
 * Speak long text chunk by chunk, pausing briefly between chunks.
 *
 * @param {string} text - Text to speak.
 * @param {Object} [options] - Passed unchanged to `ttsService.speak()`.
 * @returns {Promise<void>} Resolves right after the final chunk finishes.
 */
async function playLongText(text, options = {}) {
  // Skip blank chunks: speaking them would only add silent pauses.
  const chunks = splitTextIntoChunks(text, 200)
    .filter((chunk) => chunk.trim() !== '');

  for (let index = 0; index < chunks.length; index += 1) {
    await ttsService.speak(chunks[index], options);

    // Pause between chunks only; nothing follows the last one.
    if (index < chunks.length - 1) {
      await new Promise(resolve => setTimeout(resolve, 500));
    }
  }
}

2. 音频可视化

javascript
// 实时音频波形显示
import { visualizeAudio } from '../utils/audioUtils';

// Usage sketch: `stream` and `canvasElement` are not defined in this snippet
// and must be supplied by the surrounding code.
// NOTE(review): presumably `stream` is a MediaStream (e.g. from getUserMedia)
// — confirm where it is expected to come from.
const audioContext = new AudioContext();
const audioSource = audioContext.createMediaStreamSource(stream);
visualizeAudio(audioContext, canvasElement, audioSource);

3. 语音识别结合

javascript
// 语音识别 + 语音合成 完整对话
/**
 * One-shot voice conversation: recognise speech, obtain a reply, speak it.
 */
class VoiceConversation {
  /**
   * @throws {Error} When the browser exposes no speech recognition API.
   */
  constructor() {
    // Prefer the unprefixed constructor and fall back to the WebKit-prefixed
    // one; a bare `webkitSpeechRecognition` reference throws where only the
    // unprefixed name exists.
    const Recognition =
      window.SpeechRecognition || window.webkitSpeechRecognition;
    if (!Recognition) {
      throw new Error('Speech recognition is not supported in this browser');
    }

    this.recognition = new Recognition();
    this.tts = ttsService;
    this.setupRecognition();
  }

  /** Configure recognition for a single Mandarin utterance and wire handlers. */
  setupRecognition() {
    this.recognition.continuous = false;
    this.recognition.lang = 'zh-CN';

    this.recognition.onresult = async (event) => {
      // Nothing awaits an event handler, so failures are reported here
      // instead of becoming unhandled rejections.
      try {
        const text = event.results[0][0].transcript;
        const response = await this.getAIResponse(text);
        await this.tts.speak(response);
      } catch (error) {
        console.error('Voice conversation failed:', error);
      }
    };

    this.recognition.onerror = (event) => {
      console.error('Speech recognition error:', event.error);
    };
  }

  /** Start listening. */
  start() {
    this.recognition.start();
  }

  /**
   * Produce the reply for the recognised input.
   *
   * @param {string} input - Recognised user speech.
   * @returns {Promise<string>} Reply text; currently a fixed placeholder.
   */
  async getAIResponse(input) {
    // Call the AI API here to obtain a real reply.
    return '这是回复内容';
  }
}

性能优化建议

1. 预加载语音

javascript
// Preload the voice list at application start-up; the return value is
// intentionally discarded.
window.speechSynthesis.getVoices();

2. 文本缓存

javascript
// 缓存已合成的音频
// Maps a normalised text key to the promise of its synthesized audio.
const audioCache = new Map();

/**
 * Return synthesized audio for `text`, reusing earlier results.
 *
 * The in-flight promise is cached, so concurrent calls for the same text
 * share a single synthesis request. Failed requests are evicted so a later
 * call can retry.
 *
 * @param {string} text - Text to synthesize; the key ignores case and
 *   surrounding whitespace.
 * @returns {Promise<*>} Whatever `synthesize(text)` resolves to.
 */
async function getCachedAudio(text) {
  const cacheKey = text.trim().toLowerCase();

  if (!audioCache.has(cacheKey)) {
    // The async wrapper turns a synchronous throw into a rejection.
    const pending = (async () => synthesize(text))();
    audioCache.set(cacheKey, pending);

    pending.catch(() => {
      // Only evict our own entry; a newer request may have replaced it.
      if (audioCache.get(cacheKey) === pending) {
        audioCache.delete(cacheKey);
      }
    });
  }

  return audioCache.get(cacheKey);
}

3. Web Worker 处理

javascript
// worker.js
// Worker entry point: split the received text off the main thread and post
// the chunks back in a `ready` message.
self.onmessage = async function handleMessage(event) {
  const { text } = event.data;
  const chunks = splitTextIntoChunks(text);

  self.postMessage({ type: 'ready', chunks });
};

总结

本文介绍了在 Web 项目中集成文本转语音功能的完整方案:

  1. 服务层 - 封装 Web Speech API 和云服务
  2. 组件层 - 可复用的 Vue 组件
  3. 工具层 - 音频处理辅助函数
  4. 扩展功能 - 长文本处理、可视化等

通过这个方案,你可以快速在你的 Web 项目中实现强大的文本转语音功能。根据实际需求选择合适的技术栈和服务提供商,打造流畅的用户体验。


发布于 2025-06-28

基于 VitePress 构建