# intelligent_segmentation.py (16.8 KB)
#
# Raw | Blame | History | Permalink

# AIfeng/2025-07-07 15:25:48
# 智能断句模块 - 基于静音间隔的语义分段

import time
import numpy as np
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from enum import Enum
import threading
import logging

class SegmentType(Enum):
    """Category assigned to a speech segment."""
    WORD_CONTINUATION = "word_continuation"      # connection between words
    PHRASE_CONNECTION = "phrase_connection"      # connection between phrases
    SENTENCE_BOUNDARY = "sentence_boundary"      # sentence boundary
    TOPIC_BOUNDARY = "topic_boundary"            # topic boundary

@dataclass
class SpeechSegment:
    """Data structure describing one speech segment."""
    # Recognised text of the segment.
    text: str
    # Start and end of the segment on the caller's timeline.
    start_time: float
    end_time: float
    # Pause lengths surrounding the segment.
    silence_before: float
    silence_after: float
    # Confidence value supplied with the text.
    confidence: float
    # Classification of the segment (see SegmentType).
    segment_type: SegmentType
    # Set to True once the segment is considered finished.
    is_complete: bool = False

class IntelligentSentenceSegmentation:
    """智能断句处理器"""

    def __init__(self, config: Dict = None):
        self.config = config or self._get_default_config()
        self.silence_thresholds = self.config.get('silence_thresholds', {
            'micro_pause': 0.3,      # 词间停顿
            'phrase_pause': 1.0,     # 短语间停顿
            'sentence_pause': 2.0,   # 句子间停顿
            'topic_pause': 4.0       # 话题间停顿
        })

        self.segment_buffer = []  # 片段缓冲区
        self.user_speech_pattern = {
            'avg_pause_duration': 1.2,
            'speech_rate': 150,  # 词/分钟
            'pause_variance': 0.3
        }

        self.recent_pauses = []  # 最近的停顿记录
        self.adaptive_enabled = self.config.get('adaptive_threshold', True)

        self.logger = logging.getLogger(__name__)

    def _get_default_config(self) -> Dict:
        """获取默认配置"""
        return {
            'silence_thresholds': {
                'micro_pause': 0.3,
                'phrase_pause': 1.0,
                'sentence_pause': 2.0,
                'topic_pause': 4.0
            },
            'adaptive_threshold': True,
            'semantic_analysis': True,
            'grammar_check': True,
            'max_segment_length': 50,  # 最大片段长度（词数）
            'min_segment_length': 3    # 最小片段长度（词数）
        }

    def process_speech_segment(self, text: str, silence_duration: float,
                             timestamp: float, confidence: float) -> List[SpeechSegment]:
        """Register a new piece of recognised speech and update the buffer.

        Args:
            text: Recognised text of the new segment.
            silence_duration: Length of the pause that preceded ``text``.
            timestamp: Start time assigned to the new segment.
            confidence: Confidence value stored on the new segment.

        Returns:
            The segments reported by ``_process_segment_buffer()``; an empty
            list when nothing was completed or when an error occurred (the
            error is logged together with its traceback).
        """
        try:
            # Remember the pause so the adaptive thresholds can follow the speaker.
            if silence_duration > 0:
                self.recent_pauses.append(silence_duration)
                # Keep only the 20 most recent pauses.
                del self.recent_pauses[:-20]

            if self.adaptive_enabled:
                self._adjust_thresholds()

            segment_type = self._classify_segment_type(text, silence_duration)

            # The pause in front of the new segment is also the pause behind
            # the previous one; the field was initialised to 0.0 when that
            # segment was created and is filled in here.
            if self.segment_buffer:
                self.segment_buffer[-1].silence_after = silence_duration

            segment = SpeechSegment(
                text=text,
                start_time=timestamp,
                # Estimated end time: 0.4 per whitespace-separated word.
                end_time=timestamp + len(text.split()) * 0.4,
                silence_before=silence_duration,
                silence_after=0.0,  # filled in when the next segment arrives
                confidence=confidence,
                segment_type=segment_type
            )

            self.segment_buffer.append(segment)

            # Merge with, or close, the previous segment.
            return self._process_segment_buffer()

        except Exception as e:
            # Best-effort API: report the failure (with traceback) and carry on.
            self.logger.exception(f"处理语音片段时出错: {e}")
            return []

    def _classify_segment_type(self, text: str, silence_duration: float) -> SegmentType:
        """Map the pause length in front of a segment to a ``SegmentType``.

        ``text`` is accepted but not consulted; only ``silence_duration`` is
        compared against the configured thresholds.
        """
        # Repair the threshold table if it has been replaced by a non-dict.
        if not isinstance(self.silence_thresholds, dict):
            self.silence_thresholds = self._get_default_config()['silence_thresholds']
        limits = self.silence_thresholds

        # Ordered ladder of (threshold key, fallback value, resulting type);
        # the first upper bound that is not exceeded wins.
        ladder = (
            ('micro_pause', 0.3, SegmentType.WORD_CONTINUATION),
            ('phrase_pause', 1.0, SegmentType.PHRASE_CONNECTION),
            ('sentence_pause', 2.0, SegmentType.SENTENCE_BOUNDARY),
        )
        for key, fallback, kind in ladder:
            if silence_duration <= limits.get(key, fallback):
                return kind

        # Longer than every bound above.
        return SegmentType.TOPIC_BOUNDARY

    def _process_segment_buffer(self) -> List[SpeechSegment]:
        """处理片段缓冲区"""
        if len(self.segment_buffer) < 2:
            return []

        processed_segments = []
        current_segment = self.segment_buffer[-2]  # 倒数第二个片段
        next_segment = self.segment_buffer[-1]     # 最新片段

        # 语义连接分析
        connection_type = self._analyze_semantic_connection(
            current_segment.text,
            next_segment.text,
            next_segment.silence_before
        )

        # 根据连接类型决定处理方式
        if connection_type == 'continuation':
            # 合并片段
            merged_segment = self._merge_segments(current_segment, next_segment)
            self.segment_buffer[-2] = merged_segment
            self.segment_buffer.pop()  # 移除最新片段
        elif connection_type == 'new_sentence':
            # 标记当前片段为完成
            current_segment.is_complete = True
            processed_segments.append(current_segment)

        return processed_segments

    def _analyze_semantic_connection(self, prev_text: str, current_text: str,
                                   silence_duration: float) -> str:
        """分析语义连接类型"""
        # 确保silence_thresholds是字典类型
        if not isinstance(self.silence_thresholds, dict):
            self.silence_thresholds = {
                'micro_pause': 0.3,
                'phrase_pause': 0.8,
                'sentence_pause': 1.5,
                'topic_pause': 3.0
            }

        # 语法完整性检查
        if self._is_grammatically_complete(prev_text):
            sentence_pause_threshold = self.silence_thresholds.get('sentence_pause', 1.5)
            if silence_duration >= sentence_pause_threshold:
                return 'new_sentence'

        # 语义相关性检查
        if self.config.get('semantic_analysis', True):
            semantic_score = self._calculate_semantic_similarity(prev_text, current_text)

            phrase_pause_threshold = self.silence_thresholds.get('phrase_pause', 0.8)
            if silence_duration >= phrase_pause_threshold:
                if semantic_score > 0.7:
                    return 'continuation'  # 语义相关，继续当前句子
                else:
                    return 'new_sentence'  # 语义不相关，新句子

        return 'continuation'

    def _is_grammatically_complete(self, text: str) -> bool:
        """检查语法完整性"""
        if not self.config.get('grammar_check', True):
            return False

        # 简单的语法完整性检查
        text = text.strip()

        # 检查句子结束标点
        if text.endswith(('。', '！', '？', '.', '!', '?')):
            return True

        # 检查常见的完整句式
        complete_patterns = [
            '是的', '不是', '好的', '没有', '有的', '对的', '错的',
            '可以', '不可以', '行', '不行', '是', '不是'
        ]

        for pattern in complete_patterns:
            if text.endswith(pattern):
                return True

        # 检查词数（简单启发式）
        word_count = len(text.split())
        if word_count >= self.config.get('min_complete_words', 5):
            return True

        return False

    def _calculate_semantic_similarity(self, text1: str, text2: str) -> float:
        """计算语义相似度（简化版本）"""
        # 这里使用简单的词汇重叠度作为语义相似度的近似
        words1 = set(text1.split())
        words2 = set(text2.split())

        if not words1 or not words2:
            return 0.0

        intersection = words1.intersection(words2)
        union = words1.union(words2)

        return len(intersection) / len(union) if union else 0.0

    def _merge_segments(self, segment1: SpeechSegment, segment2: SpeechSegment) -> SpeechSegment:
        """Combine two consecutive segments into a new, not-yet-complete one.

        The result spans from the start of ``segment1`` to the end of
        ``segment2``, joins both texts with a single space, keeps the lower of
        the two confidence values and takes its type from ``segment2``.
        """
        combined = dict(
            text=f"{segment1.text} {segment2.text}",
            # Outer edges of the pair.
            start_time=segment1.start_time,
            end_time=segment2.end_time,
            silence_before=segment1.silence_before,
            silence_after=segment2.silence_after,
            # The merged segment is only as reliable as its weaker part.
            confidence=min(segment1.confidence, segment2.confidence),
            segment_type=segment2.segment_type,
            is_complete=False,
        )
        return SpeechSegment(**combined)

    def _adjust_thresholds(self):
        """根据用户说话习惯动态调整阈值"""
        if len(self.recent_pauses) >= 10:
            avg_pause = np.mean(self.recent_pauses)
            std_pause = np.std(self.recent_pauses)

            # 确保silence_thresholds是字典类型
            if not isinstance(self.silence_thresholds, dict):
                self.silence_thresholds = {
                    'micro_pause': 0.3,
                    'phrase_pause': 0.8,
                    'sentence_pause': 1.5,
                    'topic_pause': 3.0
                }

            # 个性化阈值调整
            self.silence_thresholds['phrase_pause'] = max(0.5, avg_pause + 0.5 * std_pause)
            self.silence_thresholds['sentence_pause'] = max(1.0, avg_pause + 1.5 * std_pause)

            phrase_threshold = self.silence_thresholds.get('phrase_pause', 0.8)
            sentence_threshold = self.silence_thresholds.get('sentence_pause', 1.5)
            self.logger.debug(f"阈值已调整: phrase={phrase_threshold:.2f}, "
                            f"sentence={sentence_threshold:.2f}")

    def get_completed_segments(self) -> List[SpeechSegment]:
        """获取已完成的片段"""
        completed = [seg for seg in self.segment_buffer if seg.is_complete]
        # 清理已完成的片段
        self.segment_buffer = [seg for seg in self.segment_buffer if not seg.is_complete]
        return completed

    def force_complete_current_segment(self) -> Optional[SpeechSegment]:
        """强制完成当前片段"""
        if self.segment_buffer:
            current_segment = self.segment_buffer[-1]
            current_segment.is_complete = True
            return current_segment
        return None

    def reset(self):
        """重置分割器状态"""
        self.segment_buffer.clear()
        self.recent_pauses.clear()
        self.logger.info("智能断句器已重置")

    def create_session(self, session_id: str):
        """创建会话"""
        # 为会话初始化相关数据结构
        self.logger.info(f"智能断句会话创建: {session_id}")

    def update_config(self, config: Dict):
        """更新配置"""
        if 'silence_thresholds' in config:
            # 更新静音阈值配置
            thresholds = config['silence_thresholds']
            self.logger.info(f"更新静音阈值配置: {thresholds}")

        if 'semantic_analysis' in config:
            # 更新语义分析配置
            semantic_config = config['semantic_analysis']
            self.logger.info(f"更新语义分析配置: {semantic_config}")

    def complete_session(self, session_id: str):
        """完成会话"""
        # 清理会话相关的缓存数据
        self.logger.info(f"智能断句会话完成: {session_id}")

    def shutdown(self):
        """关闭模块"""
        self.reset()
        self.logger.info("智能断句模块已关闭")

    def get_statistics(self) -> Dict:
        """获取统计信息"""
        return {
            'buffer_size': len(self.segment_buffer),
            'recent_pauses_count': len(self.recent_pauses),
            'avg_pause_duration': np.mean(self.recent_pauses) if self.recent_pauses else 0,
            'current_thresholds': self.silence_thresholds.copy(),
            'adaptive_enabled': self.adaptive_enabled
        }

    def process_text(self, text: str, context: Dict = None) -> Dict:
        """处理文本分割（兼容OptimizationManager调用）"""
        try:
            # 提取上下文信息
            timestamp = context.get('timestamp', time.time()) if context else time.time()
            confidence = context.get('confidence', 0.8) if context else 0.8
            silence_duration = context.get('silence_duration', 1.0) if context else 1.0

            # 处理语音片段
            segments = self.process_speech_segment(text, silence_duration, timestamp, confidence)

            # 返回处理结果
            if segments:
                # 返回最新的完整片段
                latest_segment = segments[-1]
                # 安全获取segment_type的值
                segment_type_value = latest_segment.segment_type.value if isinstance(latest_segment.segment_type, SegmentType) else str(latest_segment.segment_type)
                return {
                    'success': True,
                    'text': latest_segment.text,
                    'confidence': latest_segment.confidence,
                    'segment_type': segment_type_value,
                    'is_complete': latest_segment.is_complete
                }
            else:
                # 如果没有完整片段，返回原文本
                return {
                    'success': True,
                    'text': text,
                    'confidence': confidence,
                    'segment_type': 'continuation',
                    'is_complete': False
                }

        except Exception as e:
            self.logger.error(f"处理文本分割时出错: {e}")
            return {
                'success': False,
                'text': text,
                'confidence': 0.0,
                'error': str(e)
            }

    def get_performance_stats(self) -> Dict:
        """获取性能统计"""
        total_segments = len(self.segment_buffer)
        completed_segments = len([seg for seg in self.segment_buffer if seg.is_complete])
        avg_confidence = np.mean([seg.confidence for seg in self.segment_buffer]) if self.segment_buffer else 0.0

        return {
            'total_segments': total_segments,
            'completed_segments': completed_segments,
            'pending_segments': total_segments - completed_segments,
            'average_confidence': avg_confidence,
            'processing_efficiency': completed_segments / total_segments if total_segments > 0 else 0.0
        }

class AdaptiveSilenceThreshold:
    """Adaptive silence-threshold tuner.

    Keeps a sliding window of observed pause lengths and scales a set of base
    thresholds according to the speaker's average pause and its spread.
    """

    def __init__(self):
        """Start with a preset speech pattern and an empty pause history."""
        self.user_speech_pattern = {
            'avg_pause_duration': 1.2,
            'speech_rate': 150,  # words per minute
            'pause_variance': 0.3
        }
        self.history_window = 50  # maximum number of pauses remembered
        self.pause_history = []

    def update_speech_pattern(self, pause_duration: float, speech_rate: Optional[float] = None):
        """Record one pause and refresh the derived speech statistics.

        Args:
            pause_duration: Length of the observed pause.
            speech_rate: Optional new speech rate; ignored when ``None`` or 0.
        """
        self.pause_history.append(pause_duration)
        # Trim down to the window size, also when the window was shrunk
        # after entries had already been collected.
        overflow = len(self.pause_history) - self.history_window
        if overflow > 0:
            del self.pause_history[:overflow]

        # Store plain floats rather than numpy scalars. Note that
        # 'pause_variance' holds the standard deviation of the history.
        self.user_speech_pattern['avg_pause_duration'] = float(np.mean(self.pause_history))
        self.user_speech_pattern['pause_variance'] = float(np.std(self.pause_history))

        if speech_rate:
            self.user_speech_pattern['speech_rate'] = speech_rate

    def get_adaptive_thresholds(self, base_thresholds: Dict) -> Dict:
        """Return a scaled copy of ``base_thresholds``.

        With fewer than 5 recorded pauses the values are returned unchanged.
        Otherwise every value is multiplied by ``avg_pause / 1.2`` (clamped
        to the range 0.5–2.0) and increased by 0.3 times the stored pause
        spread. The input dict is never modified and never returned itself.
        """
        # Always hand back a new dict so callers can modify the result
        # without touching their base configuration.
        adaptive_thresholds = base_thresholds.copy()
        if len(self.pause_history) < 5:
            return adaptive_thresholds

        avg_pause = self.user_speech_pattern['avg_pause_duration']
        variance = self.user_speech_pattern['pause_variance']

        # Scale relative to a 1.2 second baseline, limited to 0.5x–2x.
        adjustment_factor = min(2.0, max(0.5, avg_pause / 1.2))
        spread_bonus = variance * 0.3

        for key, value in base_thresholds.items():
            adaptive_thresholds[key] = value * adjustment_factor + spread_bonus

        return adaptive_thresholds