Showing
16 changed files
with
3253 additions
and
5 deletions
| @@ -76,3 +76,11 @@ ANSPIRE_API_KEY= | @@ -76,3 +76,11 @@ ANSPIRE_API_KEY= | ||
| 76 | # Bocha AI Search API(用于Bocha多模态搜索,这里密钥名称虽然是Web Search,但其实是要AI Search的,申请地址:https://open.bochaai.com/) | 76 | # Bocha AI Search API(用于Bocha多模态搜索,这里密钥名称虽然是Web Search,但其实是要AI Search的,申请地址:https://open.bochaai.com/) |
| 77 | BOCHA_BASE_URL=https://api.bocha.cn/v1/ai-search | 77 | BOCHA_BASE_URL=https://api.bocha.cn/v1/ai-search |
| 78 | BOCHA_WEB_SEARCH_API_KEY= | 78 | BOCHA_WEB_SEARCH_API_KEY= |
| 79 | +# ================== GraphRAG 配置 ==================== | ||
| 80 | +# GraphRAG 功能开关(true/false),默认关闭 | ||
| 81 | +# 开启后会构建知识图谱并在章节生成前进行图谱查询 | ||
| 82 | +GRAPHRAG_ENABLED=false | ||
| 83 | + | ||
| 84 | +# GraphRAG 查询次数上限(每个章节生成前LLM可查询知识图谱的最大次数) | ||
| 85 | +# 仅在 GRAPHRAG_ENABLED=true 时生效 | ||
| 86 | +GRAPHRAG_MAX_QUERIES=3 |
| @@ -39,6 +39,21 @@ from .renderers import HTMLRenderer | @@ -39,6 +39,21 @@ from .renderers import HTMLRenderer | ||
| 39 | from .state import ReportState | 39 | from .state import ReportState |
| 40 | from .utils.config import settings, Settings | 40 | from .utils.config import settings, Settings |
| 41 | 41 | ||
| 42 | +# GraphRAG 模块导入 | ||
| 43 | +from .graphrag import ( | ||
| 44 | + StateParser, | ||
| 45 | + ForumParser, | ||
| 46 | + GraphBuilder, | ||
| 47 | + GraphStorage, | ||
| 48 | + Graph, | ||
| 49 | + QueryEngine, | ||
| 50 | +) | ||
| 51 | +from .nodes import GraphRAGQueryNode | ||
| 52 | +from .graphrag.prompts import ( | ||
| 53 | + SYSTEM_PROMPT_CHAPTER_GRAPH_ENHANCEMENT, | ||
| 54 | + format_graph_results_for_prompt | ||
| 55 | +) | ||
| 56 | + | ||
| 42 | 57 | ||
| 43 | class StageOutputFormatError(ValueError): | 58 | class StageOutputFormatError(ValueError): |
| 44 | """阶段性输出结构不符合预期时抛出的受控异常。""" | 59 | """阶段性输出结构不符合预期时抛出的受控异常。""" |
| @@ -559,6 +574,37 @@ class ReportAgent: | @@ -559,6 +574,37 @@ class ReportAgent: | ||
| 559 | self._persist_planning_artifacts(run_dir, layout_design, word_plan, template_overview) | 574 | self._persist_planning_artifacts(run_dir, layout_design, word_plan, template_overview) |
| 560 | emit('stage', {'stage': 'storage_ready', 'run_dir': str(run_dir)}) | 575 | emit('stage', {'stage': 'storage_ready', 'run_dir': str(run_dir)}) |
| 561 | 576 | ||
| 577 | + # ==================== GraphRAG 初始化 ==================== | ||
| 578 | + graphrag_enabled = getattr(self.config, 'GRAPHRAG_ENABLED', False) | ||
| 579 | + knowledge_graph = None | ||
| 580 | + graphrag_query_node = None | ||
| 581 | + | ||
| 582 | + if graphrag_enabled: | ||
| 583 | + logger.info("GraphRAG 已启用,开始构建知识图谱...") | ||
| 584 | + emit('stage', {'stage': 'graphrag_building', 'message': '正在构建知识图谱'}) | ||
| 585 | + | ||
| 586 | + try: | ||
| 587 | + knowledge_graph = self._build_knowledge_graph( | ||
| 588 | + query, normalized_reports, forum_logs, run_dir | ||
| 589 | + ) | ||
| 590 | + if knowledge_graph: | ||
| 591 | + graphrag_query_node = GraphRAGQueryNode(self.llm_client) | ||
| 592 | + graph_stats = knowledge_graph.get_stats() | ||
| 593 | + emit('stage', { | ||
| 594 | + 'stage': 'graphrag_built', | ||
| 595 | + 'node_count': graph_stats.get('total_nodes', 0), | ||
| 596 | + 'edge_count': graph_stats.get('total_edges', 0) | ||
| 597 | + }) | ||
| 598 | + logger.info(f"知识图谱构建完成: {graph_stats}") | ||
| 599 | + else: | ||
| 600 | + logger.warning("知识图谱构建失败,将使用原始流程") | ||
| 601 | + graphrag_enabled = False | ||
| 602 | + except Exception as graph_error: | ||
| 603 | + logger.exception(f"GraphRAG 构建异常: {graph_error}") | ||
| 604 | + graphrag_enabled = False | ||
| 605 | + emit('stage', {'stage': 'graphrag_error', 'error': str(graph_error)}) | ||
| 606 | + # ==================== GraphRAG 初始化结束 ==================== | ||
| 607 | + | ||
| 562 | chapters = [] | 608 | chapters = [] |
| 563 | chapter_max_attempts = max( | 609 | chapter_max_attempts = max( |
| 564 | self._CONTENT_SPARSE_MIN_ATTEMPTS, self.config.CHAPTER_JSON_MAX_ATTEMPTS | 610 | self._CONTENT_SPARSE_MIN_ATTEMPTS, self.config.CHAPTER_JSON_MAX_ATTEMPTS |
| @@ -594,11 +640,47 @@ class ReportAgent: | @@ -594,11 +640,47 @@ class ReportAgent: | ||
| 594 | best_sparse_candidate: Dict[str, Any] | None = None | 640 | best_sparse_candidate: Dict[str, Any] | None = None |
| 595 | best_sparse_score = -1 | 641 | best_sparse_score = -1 |
| 596 | fallback_used = False | 642 | fallback_used = False |
| 643 | + | ||
| 644 | + # ==================== GraphRAG 查询 ==================== | ||
| 645 | + graph_results = None | ||
| 646 | + chapter_context = generation_context.copy() | ||
| 647 | + | ||
| 648 | + if graphrag_enabled and knowledge_graph and graphrag_query_node: | ||
| 649 | + try: | ||
| 650 | + max_queries = getattr(self.config, 'GRAPHRAG_MAX_QUERIES', 3) | ||
| 651 | + section_info = { | ||
| 652 | + 'title': section.title, | ||
| 653 | + 'id': section.chapter_id, | ||
| 654 | + 'role': section.description, | ||
| 655 | + 'target_words': chapter_targets.get(section.chapter_id, {}).get('targetWords', 500), | ||
| 656 | + 'emphasis': chapter_targets.get(section.chapter_id, {}).get('emphasisPoints', '') | ||
| 657 | + } | ||
| 658 | + | ||
| 659 | + graph_results = graphrag_query_node.run( | ||
| 660 | + section_info, | ||
| 661 | + { | ||
| 662 | + 'query': query, | ||
| 663 | + 'template_name': template_result.get('template_name'), | ||
| 664 | + 'chapters': word_plan.get('chapters', []) | ||
| 665 | + }, | ||
| 666 | + knowledge_graph, | ||
| 667 | + max_queries=max_queries | ||
| 668 | + ) | ||
| 669 | + | ||
| 670 | + if graph_results and graph_results.get('total_nodes', 0) > 0: | ||
| 671 | + # 将图谱结果注入生成上下文 | ||
| 672 | + chapter_context['graph_results'] = graph_results | ||
| 673 | + chapter_context['graph_enhancement_prompt'] = format_graph_results_for_prompt(graph_results) | ||
| 674 | + logger.info(f"章节 {section.title} GraphRAG 查询完成: {graph_results.get('total_nodes', 0)} 节点") | ||
| 675 | + except Exception as graph_query_error: | ||
| 676 | + logger.warning(f"GraphRAG 查询失败 ({section.title}): {graph_query_error}") | ||
| 677 | + # ==================== GraphRAG 查询结束 ==================== | ||
| 678 | + | ||
| 597 | while attempt <= chapter_max_attempts: | 679 | while attempt <= chapter_max_attempts: |
| 598 | try: | 680 | try: |
| 599 | chapter_payload = self.chapter_generation_node.run( | 681 | chapter_payload = self.chapter_generation_node.run( |
| 600 | section, | 682 | section, |
| 601 | - generation_context, | 683 | + chapter_context, # 使用包含图谱结果的上下文 |
| 602 | run_dir, | 684 | run_dir, |
| 603 | stream_callback=chunk_callback | 685 | stream_callback=chunk_callback |
| 604 | ) | 686 | ) |
| @@ -796,6 +878,62 @@ class ReportAgent: | @@ -796,6 +878,62 @@ class ReportAgent: | ||
| 796 | self.state.metadata.template_used = fallback_template['template_name'] | 878 | self.state.metadata.template_used = fallback_template['template_name'] |
| 797 | return fallback_template | 879 | return fallback_template |
| 798 | 880 | ||
def _build_knowledge_graph(
    self,
    query: str,
    reports: Dict[str, str],
    forum_logs: str,
    run_dir: Path
) -> Optional[Graph]:
    """
    Build and persist the knowledge graph used for GraphRAG chapter queries.

    The graph is assembled from the engine states cached on the instance
    (``self._loaded_states``, filled by the input-loading step when
    GRAPHRAG_ENABLED is set) and from the parsed forum log.

    Args:
        query: The user's query topic; becomes the topic of the graph.
        reports: Normalized report mapping. Currently not read by this
            method; kept so the call signature stays stable.
        forum_logs: Raw forum log text. Falsy values skip forum parsing.
        run_dir: Run directory handed to the graph storage when saving.

    Returns:
        The built graph, or None if any step (parsing, building or
        saving) raised an exception.
    """
    try:
        # The cache attribute only exists once a state file has been loaded,
        # so fall back to an empty mapping instead of probing with hasattr
        # on every iteration.
        loaded_states = getattr(self, '_loaded_states', None) or {}
        states = {
            engine: loaded_states[engine]
            for engine in ('insight', 'media', 'query')
            if engine in loaded_states
        }

        # Parse the forum log into structured entries.
        forum_entries = []
        if forum_logs:
            forum_entries = ForumParser().parse(forum_logs)
            logger.info(f"解析论坛日志: {len(forum_entries)} 条记录")

        # Build the graph from the structured inputs.
        graph = GraphBuilder().build(query, states, forum_entries)

        # Persist the graph under the run directory.
        graph_path = GraphStorage().save(graph, self.state.task_id, run_dir)
        logger.info(f"知识图谱已保存: {graph_path}")

        return graph

    except Exception as e:
        # Graph construction is optional: log with traceback and let the
        # caller fall back to the flow without graph enhancement.
        logger.exception(f"构建知识图谱失败: {e}")
        return None
| 936 | + | ||
| 799 | def _slice_template(self, template_markdown: str) -> List[TemplateSection]: | 937 | def _slice_template(self, template_markdown: str) -> List[TemplateSection]: |
| 800 | """ | 938 | """ |
| 801 | 将模板切成章节列表,若为空则提供fallback。 | 939 | 将模板切成章节列表,若为空则提供fallback。 |
| @@ -1464,15 +1602,18 @@ class ReportAgent: | @@ -1464,15 +1602,18 @@ class ReportAgent: | ||
| 1464 | file_paths: 文件路径字典 | 1602 | file_paths: 文件路径字典 |
| 1465 | 1603 | ||
| 1466 | Returns: | 1604 | Returns: |
| 1467 | - 加载的内容字典,包含 `reports` 列表与 `forum_logs` 字符串 | 1605 | + 加载的内容字典,包含 `reports` 列表、`forum_logs` 字符串和 `states` 字典 |
| 1468 | """ | 1606 | """ |
| 1469 | content = { | 1607 | content = { |
| 1470 | 'reports': [], | 1608 | 'reports': [], |
| 1471 | - 'forum_logs': '' | 1609 | + 'forum_logs': '', |
| 1610 | + 'states': {} # 新增:用于 GraphRAG 的 State JSON | ||
| 1472 | } | 1611 | } |
| 1473 | 1612 | ||
| 1474 | # 加载报告文件 | 1613 | # 加载报告文件 |
| 1475 | engines = ['query', 'media', 'insight'] | 1614 | engines = ['query', 'media', 'insight'] |
| 1615 | + state_parser = StateParser() | ||
| 1616 | + | ||
| 1476 | for engine in engines: | 1617 | for engine in engines: |
| 1477 | if engine in file_paths: | 1618 | if engine in file_paths: |
| 1478 | try: | 1619 | try: |
| @@ -1480,6 +1621,20 @@ class ReportAgent: | @@ -1480,6 +1621,20 @@ class ReportAgent: | ||
| 1480 | report_content = f.read() | 1621 | report_content = f.read() |
| 1481 | content['reports'].append(report_content) | 1622 | content['reports'].append(report_content) |
| 1482 | logger.info(f"已加载 {engine} 报告: {len(report_content)} 字符") | 1623 | logger.info(f"已加载 {engine} 报告: {len(report_content)} 字符") |
| 1624 | + | ||
| 1625 | + # 新增:尝试查找并加载对应的 State JSON(用于 GraphRAG) | ||
| 1626 | + if self.config.GRAPHRAG_ENABLED: | ||
| 1627 | + state_path = state_parser.find_state_json(file_paths[engine]) | ||
| 1628 | + if state_path: | ||
| 1629 | + parsed_state = state_parser.parse_from_file(engine, state_path) | ||
| 1630 | + if parsed_state: | ||
| 1631 | + content['states'][engine] = parsed_state | ||
| 1632 | + # 同时保存到实例属性,供 _build_knowledge_graph 使用 | ||
| 1633 | + if not hasattr(self, '_loaded_states'): | ||
| 1634 | + self._loaded_states = {} | ||
| 1635 | + self._loaded_states[engine] = parsed_state | ||
| 1636 | + logger.info(f"已加载 {engine} State JSON: {len(parsed_state.sections)} 个段落") | ||
| 1637 | + | ||
| 1483 | except Exception as e: | 1638 | except Exception as e: |
| 1484 | logger.exception(f"加载 {engine} 报告失败: {str(e)}") | 1639 | logger.exception(f"加载 {engine} 报告失败: {str(e)}") |
| 1485 | content['reports'].append("") | 1640 | content['reports'].append("") |
ReportEngine/graphrag/__init__.py
0 → 100644
| 1 | +""" | ||
| 2 | +GraphRAG 知识图谱模块 | ||
| 3 | + | ||
| 4 | +提供基于结构化数据的知识图谱构建、存储与查询功能。 | ||
| 5 | +""" | ||
| 6 | + | ||
| 7 | +from .state_parser import StateParser, ParsedState, ParsedSection, SearchRecord | ||
| 8 | +from .forum_parser import ForumParser, ForumEntry | ||
| 9 | +from .graph_builder import GraphBuilder | ||
| 10 | +from .graph_storage import GraphStorage, Graph, Node, Edge | ||
| 11 | +from .query_engine import QueryEngine, QueryParams, QueryResult | ||
| 12 | + | ||
| 13 | +__all__ = [ | ||
| 14 | + # 解析器 | ||
| 15 | + 'StateParser', | ||
| 16 | + 'ParsedState', | ||
| 17 | + 'ParsedSection', | ||
| 18 | + 'SearchRecord', | ||
| 19 | + 'ForumParser', | ||
| 20 | + 'ForumEntry', | ||
| 21 | + # 图谱核心 | ||
| 22 | + 'GraphBuilder', | ||
| 23 | + 'GraphStorage', | ||
| 24 | + 'Graph', | ||
| 25 | + 'Node', | ||
| 26 | + 'Edge', | ||
| 27 | + # 查询引擎 | ||
| 28 | + 'QueryEngine', | ||
| 29 | + 'QueryParams', | ||
| 30 | + 'QueryResult', | ||
| 31 | +] |
ReportEngine/graphrag/forum_parser.py
0 → 100644
| 1 | +""" | ||
| 2 | +Forum 日志解析器 | ||
| 3 | + | ||
| 4 | +解析 forum.log 文件,提取结构化的讨论记录用于构建知识图谱。 | ||
| 5 | +""" | ||
| 6 | + | ||
| 7 | +from dataclasses import dataclass | ||
| 8 | +from typing import List, Optional | ||
| 9 | +import re | ||
| 10 | + | ||
| 11 | + | ||
@dataclass
class ForumEntry:
    """One parsed forum log line: a timestamp, a speaker tag and its text."""
    timestamp: str
    speaker: str
    content: str

    @property
    def is_host(self) -> bool:
        """Whether the speaker tag is HOST (case-insensitive)."""
        return self.speaker.upper() == 'HOST'

    @property
    def is_system(self) -> bool:
        """Whether the speaker tag is SYSTEM (case-insensitive)."""
        return self.speaker.upper() == 'SYSTEM'

    @property
    def engine_name(self) -> Optional[str]:
        """Lower-case engine name for INSIGHT/MEDIA/QUERY/HOST, else None."""
        speaker_upper = self.speaker.upper()
        if speaker_upper in ('INSIGHT', 'MEDIA', 'QUERY', 'HOST'):
            return speaker_upper.lower()
        return None


class ForumParser:
    """
    Parser for forum.log content.

    Each log line is expected to look like ``[HH:MM:SS] [SPEAKER] content``.
    Lines that do not match, or whose speaker is not in VALID_SPEAKERS,
    are ignored.
    """

    # Regular expression matching a single log line.
    PATTERN = re.compile(r'\[(\d{2}:\d{2}:\d{2})\]\s*\[(\w+)\]\s*(.+)')

    # Speaker tags that are accepted (compared in upper case).
    VALID_SPEAKERS = {'INSIGHT', 'MEDIA', 'QUERY', 'HOST', 'SYSTEM'}

    def parse(self, forum_logs: str) -> List[ForumEntry]:
        """
        Parse forum.log text into entries.

        Args:
            forum_logs: Full text of the forum log.

        Returns:
            One ForumEntry per accepted line, in file order. The speaker
            is stored upper-cased and escaped ``\\n`` sequences in the
            content are turned into real newlines.
        """
        if not forum_logs:
            return []

        entries = []
        for raw_line in forum_logs.split('\n'):
            # Strip each line so CRLF logs do not leave a trailing '\r' in
            # the content and indented lines still match the pattern.
            line = raw_line.strip()
            if not line:
                continue

            match = self.PATTERN.match(line)
            if not match:
                continue

            timestamp, speaker, content = match.groups()
            speaker_upper = speaker.upper()
            if speaker_upper not in self.VALID_SPEAKERS:
                continue

            entries.append(ForumEntry(
                timestamp=timestamp,
                speaker=speaker_upper,
                # The log stores multi-line messages with escaped newlines.
                content=content.replace('\\n', '\n')
            ))

        return entries

    def get_host_insights(self, entries: List[ForumEntry]) -> List[str]:
        """
        Collect the text of every host entry.

        Args:
            entries: Parsed forum entries.

        Returns:
            Contents of the entries whose speaker is HOST, in order.
        """
        return [e.content for e in entries if e.is_host]

    def get_engine_entries(self, entries: List[ForumEntry],
                           engine: str) -> List[ForumEntry]:
        """
        Select the entries spoken by one engine.

        Args:
            entries: Parsed forum entries.
            engine: Engine name (insight/media/query/host), any case.

        Returns:
            Entries whose speaker matches ``engine``. The comparison is
            case-insensitive on both sides, matching the ForumEntry
            properties.
        """
        engine_upper = engine.upper()
        return [e for e in entries if e.speaker.upper() == engine_upper]

    def get_summary_by_engine(self, entries: List[ForumEntry]) -> dict:
        """
        Group entry contents by engine.

        Args:
            entries: Parsed forum entries.

        Returns:
            Dict with the keys insight/media/query/host, each mapping to
            the list of contents spoken by that engine. Entries from other
            speakers (e.g. SYSTEM) are left out.
        """
        result = {
            'insight': [],
            'media': [],
            'query': [],
            'host': []
        }

        for entry in entries:
            engine = entry.engine_name
            if engine and engine in result:
                result[engine].append(entry.content)

        return result

    def extract_key_points(self, entries: List[ForumEntry],
                           max_points: int = 10) -> List[str]:
        """
        Extract short key points from host entries.

        Only host entries are used. Each point is the first 200 characters
        of the content (followed by ``...`` when truncated), prefixed with
        the speaker tag in square brackets.

        Args:
            entries: Parsed forum entries.
            max_points: Maximum number of points to return; values <= 0
                yield an empty list.

        Returns:
            At most ``max_points`` formatted key points, in entry order.
        """
        if max_points <= 0:
            return []

        key_points = []
        for entry in entries:
            # A HOST speaker can never also be SYSTEM, so one check suffices.
            if not entry.is_host:
                continue

            summary = entry.content[:200]
            if len(entry.content) > 200:
                summary += '...'
            key_points.append(f"[{entry.speaker}] {summary}")

            if len(key_points) >= max_points:
                break

        return key_points
ReportEngine/graphrag/graph_builder.py
0 → 100644
| 1 | +""" | ||
| 2 | +知识图谱构建器 | ||
| 3 | + | ||
| 4 | +基于结构化的 State JSON 和 Forum 日志构建知识图谱,无需 LLM 提取实体。 | ||
| 5 | +""" | ||
| 6 | + | ||
| 7 | +from typing import Dict, List, Optional | ||
| 8 | +import hashlib | ||
| 9 | + | ||
| 10 | +from .state_parser import ParsedState, ParsedSection | ||
| 11 | +from .forum_parser import ForumEntry | ||
| 12 | +from .graph_storage import Graph, Node | ||
| 13 | + | ||
| 14 | + | ||
class GraphBuilder:
    """
    Knowledge graph builder.

    Builds the graph from already-structured data (engine State JSON and
    forum log entries); no LLM is used for entity or relation extraction.

    Node types (5):
    - topic: the user's query topic
    - engine: one of the four sources (insight/media/query/host)
    - section: a report paragraph/section
    - search_query: a search keyword
    - source: an information source URL

    Relation types (4):
    - analyzed_by: topic is analyzed by an engine (Topic -> Engine)
    - contains: engine contains a section (Engine -> Section)
    - searched: section ran a search (Section -> SearchQuery)
    - found: search found a source (SearchQuery -> Source)
    """

    def build(self, topic: str, states: Dict[str, "ParsedState"],
              forum_entries: Optional[List["ForumEntry"]] = None) -> "Graph":
        """
        Build the knowledge graph.

        Args:
            topic: The user's query topic.
            states: Engine states keyed by engine name.
            forum_entries: Parsed forum log entries, if any.

        Returns:
            The populated Graph.
        """
        graph = Graph()

        # 1. Topic node; its ID is derived from the topic text.
        topic_node = graph.add_node(
            node_type="topic",
            name=topic,
            node_id=f"T_{self._hash(topic)}"
        )

        # 2. One sub-tree per engine state.
        for engine_name, state in states.items():
            self._add_engine_nodes(graph, topic_node, engine_name, state)

        # 3. Forum log (adds the host engine and its summaries).
        if forum_entries:
            self._add_forum_nodes(graph, topic_node, forum_entries)

        return graph

    def _add_engine_nodes(self, graph: "Graph", topic_node: "Node",
                          engine_name: str, state: "ParsedState") -> None:
        """Add an engine node, link it to the topic and add its sections."""
        engine_node = graph.add_node(
            node_type="engine",
            name=engine_name,
            node_id=engine_name,
            report_title=state.report_title,
            original_query=state.query
        )

        # Topic -> Engine
        graph.add_edge(topic_node, engine_node, "analyzed_by")

        for section in state.sections:
            self._add_section_nodes(graph, engine_node, engine_name, section)

    def _add_section_nodes(self, graph: "Graph", engine_node: "Node",
                           engine_name: str, section: "ParsedSection") -> None:
        """
        Add a section node with its search-query and source nodes.

        Search records with the same query text (ignoring case and
        surrounding whitespace) share one search_query node, but every
        record's URL is still linked to that node, so a query that returned
        several results keeps all of its sources. Each (query, URL) pair is
        linked only once.
        """
        section_id = f"{engine_name}_S{section.order}"
        section_node = graph.add_node(
            node_type="section",
            name=section.title,
            node_id=section_id,
            title=section.title,
            order=section.order,
            summary=section.summary,
            engine=engine_name
        )

        # Engine -> Section
        graph.add_edge(engine_node, section_node, "contains")

        query_nodes: Dict[str, "Node"] = {}  # normalized query -> its node
        linked_sources = set()               # (normalized query, url) already linked
        for idx, search in enumerate(section.search_history):
            if not search.query:
                continue

            query_key = search.query.strip().lower()
            query_node = query_nodes.get(query_key)
            if query_node is None:
                # First occurrence of this query: create the node; the ID
                # uses the index of that first record.
                query_node = graph.add_node(
                    node_type="search_query",
                    name=search.query[:50],  # truncate long queries
                    node_id=f"{section_id}_Q{idx}",
                    query_text=search.query,
                    section_ref=section_id,
                    engine=engine_name
                )
                # Section -> SearchQuery
                graph.add_edge(section_node, query_node, "searched")
                query_nodes[query_key] = query_node

            # Link the source even when the query was seen before; skipping
            # repeated queries here would drop every result after the first.
            if search.url:
                link = (query_key, search.url)
                if link not in linked_sources:
                    linked_sources.add(link)
                    self._add_source_node(graph, query_node, search)

    def _add_source_node(self, graph: "Graph", query_node: "Node",
                         search) -> None:
        """Add (or reuse) the source node for ``search.url`` and link it."""
        # The ID is a short hash of the URL so the same URL maps to one node.
        source_id = f"SRC_{self._hash(search.url)}"

        source_node = graph.get_node(source_id)
        if not source_node:
            source_node = graph.add_node(
                node_type="source",
                name=search.title[:50] if search.title else search.url[:50],
                node_id=source_id,
                url=search.url,
                title=search.title,
                preview=search.content[:100] if search.content else '',
                score=search.score
            )

        # SearchQuery -> Source
        graph.add_edge(query_node, source_node, "found")

    def _add_forum_nodes(self, graph: "Graph", topic_node: "Node",
                         entries: List["ForumEntry"]) -> None:
        """Add the host engine node and up to five host summary sections."""
        # Create the host engine node only if it is not in the graph yet.
        host_node = graph.get_node('host')
        if not host_node:
            host_node = graph.add_node(
                node_type="engine",
                name="host",
                node_id="host",
                report_title="论坛主持人总结"
            )
            graph.add_edge(topic_node, host_node, "analyzed_by")

        # Host statements become section nodes under the host engine.
        host_entries = [e for e in entries if e.is_host and not e.is_system]

        for idx, entry in enumerate(host_entries[:5]):  # at most 5 entries
            section_node = graph.add_node(
                node_type="section",
                name=f"主持人总结 {idx + 1}",
                node_id=f"host_S{idx}",
                title=f"[{entry.timestamp}] 主持人总结",
                order=idx,
                summary=entry.content[:300],
                engine="host",
                timestamp=entry.timestamp
            )

            graph.add_edge(host_node, section_node, "contains")

    @staticmethod
    def _hash(text: str) -> str:
        """Return the first 8 hex characters of the MD5 digest of ``text``."""
        return hashlib.md5(text.encode()).hexdigest()[:8]
ReportEngine/graphrag/graph_storage.py
0 → 100644
| 1 | +""" | ||
| 2 | +知识图谱存储模块 | ||
| 3 | + | ||
| 4 | +定义图谱的核心数据结构(Node、Edge、Graph)及 JSON 存储功能。 | ||
| 5 | +""" | ||
| 6 | + | ||
| 7 | +from dataclasses import dataclass, field | ||
| 8 | +from typing import Dict, Any, List, Optional, Set | ||
| 9 | +from datetime import datetime | ||
| 10 | +import json | ||
| 11 | +from pathlib import Path | ||
| 12 | +import hashlib | ||
| 13 | + | ||
| 14 | + | ||
@dataclass
class Node:
    """A graph node: an ID, a type, a display name and free-form attributes."""
    id: str
    type: str  # topic, engine, section, search_query, source
    name: str = ""
    attributes: Dict[str, Any] = field(default_factory=dict)

    @property
    def label(self) -> str:
        """Display label; alias of ``name`` kept for front-end compatibility."""
        return self.name

    @property
    def properties(self) -> Dict[str, Any]:
        """Alias of ``attributes`` kept for front-end compatibility."""
        return self.attributes

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict, emitting both the native and the alias keys."""
        return {
            'id': self.id,
            'type': self.type,
            'name': self.name,
            'label': self.name,  # compatibility key
            'attributes': self.attributes,
            'properties': self.attributes  # compatibility key
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'Node':
        """
        Create a node from a dict produced by ``to_dict`` (or an equivalent).

        ``name`` falls back to ``label`` and ``attributes`` falls back to
        ``properties``. Missing or null values become ``''`` / ``{}``, and
        the attribute mapping is copied so the node does not share state
        with ``data``.

        Raises:
            KeyError: if ``id`` or ``type`` is missing.
        """
        name = data.get('name')
        if name is None:
            name = data.get('label') or ''

        raw_attributes = data.get('attributes')
        if raw_attributes is None:
            raw_attributes = data.get('properties')

        return cls(
            id=data['id'],
            type=data['type'],
            name=name,
            attributes=dict(raw_attributes) if raw_attributes else {}
        )

    def get(self, key: str, default: Any = None) -> Any:
        """
        Look up a value by key.

        ``id``, ``type`` and ``name``/``label`` map to the node's own
        fields (``default`` is not consulted for them); any other key is
        read from ``attributes`` with ``default`` as the fallback.
        """
        if key == 'id':
            return self.id
        if key == 'type':
            return self.type
        if key in ('name', 'label'):
            return self.name
        return self.attributes.get(key, default)
| 63 | + | ||
| 64 | + | ||
@dataclass
class Edge:
    """A directed graph edge between two node IDs with a relation type."""
    from_id: str
    to_id: str
    relation: str  # analyzed_by, contains, searched, found
    weight: float = 1.0
    attributes: Dict[str, Any] = field(default_factory=dict)

    @property
    def source(self) -> str:
        """Start node ID; alias of ``from_id`` for front-end compatibility."""
        return self.from_id

    @property
    def target(self) -> str:
        """End node ID; alias of ``to_id`` for front-end compatibility."""
        return self.to_id

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict, emitting both from/to and source/target keys."""
        return {
            'from': self.from_id,
            'to': self.to_id,
            'source': self.from_id,  # compatibility key
            'target': self.to_id,  # compatibility key
            'relation': self.relation,
            'weight': self.weight,
            'attributes': self.attributes
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'Edge':
        """
        Create an edge from a dict produced by ``to_dict`` (or an equivalent).

        ``from``/``to`` fall back to ``source``/``target`` and then to
        ``''``; ``weight`` defaults to 1.0. A missing or null ``attributes``
        value becomes ``{}``, and the mapping is copied so the edge does not
        share state with ``data``.

        Raises:
            KeyError: if ``relation`` is missing.
        """
        return cls(
            from_id=data.get('from', data.get('source', '')),
            to_id=data.get('to', data.get('target', '')),
            relation=data['relation'],
            weight=data.get('weight', 1.0),
            attributes=dict(data.get('attributes') or {})
        )
| 106 | + | ||
| 107 | + | ||
| 108 | +class Graph: | ||
| 109 | + """知识图谱""" | ||
| 110 | + | ||
| 111 | + def __init__(self): | ||
| 112 | + self._nodes: Dict[str, Node] = {} | ||
| 113 | + self._edges: List[Edge] = [] | ||
| 114 | + self._adjacency: Dict[str, Set[str]] = {} # 邻接表 | ||
| 115 | + | ||
| 116 | + @property | ||
| 117 | + def nodes(self) -> Dict[str, Node]: | ||
| 118 | + """获取所有节点(字典形式,兼容前端API)""" | ||
| 119 | + return self._nodes | ||
| 120 | + | ||
| 121 | + @property | ||
| 122 | + def node_list(self) -> List[Node]: | ||
| 123 | + """获取所有节点(列表形式)""" | ||
| 124 | + return list(self._nodes.values()) | ||
| 125 | + | ||
| 126 | + @property | ||
| 127 | + def edges(self) -> List[Edge]: | ||
| 128 | + """获取所有边""" | ||
| 129 | + return self._edges | ||
| 130 | + | ||
| 131 | + @property | ||
| 132 | + def node_count(self) -> int: | ||
| 133 | + """节点数量""" | ||
| 134 | + return len(self._nodes) | ||
| 135 | + | ||
| 136 | + @property | ||
| 137 | + def edge_count(self) -> int: | ||
| 138 | + """边数量""" | ||
| 139 | + return len(self._edges) | ||
| 140 | + | ||
| 141 | + def add_node(self, node_type: str, name: str = "", | ||
| 142 | + node_id: Optional[str] = None, **attributes) -> Node: | ||
| 143 | + """ | ||
| 144 | + 添加节点 | ||
| 145 | + | ||
| 146 | + Args: | ||
| 147 | + node_type: 节点类型 | ||
| 148 | + name: 节点名称 | ||
| 149 | + node_id: 节点ID,不提供则自动生成 | ||
| 150 | + **attributes: 其他属性 | ||
| 151 | + | ||
| 152 | + Returns: | ||
| 153 | + 创建的节点 | ||
| 154 | + """ | ||
| 155 | + if node_id is None: | ||
| 156 | + # 基于类型和名称生成ID | ||
| 157 | + hash_input = f"{node_type}_{name}_{len(self._nodes)}" | ||
| 158 | + node_id = f"{node_type[:3].upper()}_{hashlib.md5(hash_input.encode()).hexdigest()[:8]}" | ||
| 159 | + | ||
| 160 | + # 如果已存在,返回现有节点 | ||
| 161 | + if node_id in self._nodes: | ||
| 162 | + return self._nodes[node_id] | ||
| 163 | + | ||
| 164 | + node = Node( | ||
| 165 | + id=node_id, | ||
| 166 | + type=node_type, | ||
| 167 | + name=name, | ||
| 168 | + attributes=attributes | ||
| 169 | + ) | ||
| 170 | + | ||
| 171 | + self._nodes[node_id] = node | ||
| 172 | + self._adjacency[node_id] = set() | ||
| 173 | + | ||
| 174 | + return node | ||
| 175 | + | ||
| 176 | + def get_node(self, node_id: str) -> Optional[Node]: | ||
| 177 | + """获取节点""" | ||
| 178 | + return self._nodes.get(node_id) | ||
| 179 | + | ||
| 180 | + def add_edge(self, from_node: Node, to_node: Node, | ||
| 181 | + relation: str, weight: float = 1.0, **attributes) -> Edge: | ||
| 182 | + """ | ||
| 183 | + 添加边 | ||
| 184 | + | ||
| 185 | + Args: | ||
| 186 | + from_node: 起始节点 | ||
| 187 | + to_node: 目标节点 | ||
| 188 | + relation: 关系类型 | ||
| 189 | + weight: 权重 | ||
| 190 | + **attributes: 其他属性 | ||
| 191 | + | ||
| 192 | + Returns: | ||
| 193 | + 创建的边 | ||
| 194 | + """ | ||
| 195 | + edge = Edge( | ||
| 196 | + from_id=from_node.id, | ||
| 197 | + to_id=to_node.id, | ||
| 198 | + relation=relation, | ||
| 199 | + weight=weight, | ||
| 200 | + attributes=attributes | ||
| 201 | + ) | ||
| 202 | + | ||
| 203 | + self._edges.append(edge) | ||
| 204 | + | ||
| 205 | + # 更新邻接表 | ||
| 206 | + if from_node.id in self._adjacency: | ||
| 207 | + self._adjacency[from_node.id].add(to_node.id) | ||
| 208 | + if to_node.id in self._adjacency: | ||
| 209 | + self._adjacency[to_node.id].add(from_node.id) | ||
| 210 | + | ||
| 211 | + return edge | ||
| 212 | + | ||
| 213 | + def get_neighbors(self, node_id: str) -> List[Node]: | ||
| 214 | + """获取邻居节点""" | ||
| 215 | + neighbor_ids = self._adjacency.get(node_id, set()) | ||
| 216 | + return [self._nodes[nid] for nid in neighbor_ids if nid in self._nodes] | ||
| 217 | + | ||
def get_edges_from(self, node_id: str) -> List[Edge]:
    """Return every stored edge that starts at ``node_id``, in stored order."""
    outgoing = []
    for edge in self._edges:
        if edge.from_id == node_id:
            outgoing.append(edge)
    return outgoing
| 221 | + | ||
def get_edges_to(self, node_id: str) -> List[Edge]:
    """Return every stored edge that points at ``node_id``, in stored order."""
    incoming = []
    for edge in self._edges:
        if edge.to_id == node_id:
            incoming.append(edge)
    return incoming
| 225 | + | ||
def get_nodes_by_type(self, node_type: str) -> List[Node]:
    """Return all stored nodes whose ``type`` equals ``node_type``."""
    selected = []
    for candidate in self._nodes.values():
        if candidate.type == node_type:
            selected.append(candidate)
    return selected
| 229 | + | ||
def get_stats(self) -> Dict[str, int]:
    """Return graph statistics.

    Returns:
        A dict holding ``total_nodes`` and ``total_edges`` followed by one
        entry per node type giving the number of nodes of that type.
    """
    per_type: Dict[str, int] = {}
    for node in self._nodes.values():
        per_type.setdefault(node.type, 0)
        per_type[node.type] += 1

    stats = {
        'total_nodes': self.node_count,
        'total_edges': self.edge_count,
    }
    # Per-type counts are merged last, exactly like the ``**`` expansion.
    stats.update(per_type)
    return stats
| 241 | + | ||
def get_summary(self) -> Dict[str, Any]:
    """Build a compact overview of the graph (used when composing prompts).

    Returns:
        A dict with the graph statistics, up to 10 section names, up to 20
        search-query texts (falling back to the node name), the name of the
        first topic node (empty string when there is none) and the names of
        all engine nodes.
    """
    overview: Dict[str, Any] = {'stats': self.get_stats()}

    # Sample entries per node type, capped to keep the prompt small.
    overview['section_titles'] = [
        section.name for section in self.get_nodes_by_type('section')[:10]
    ]
    overview['sample_queries'] = [
        query.get('query_text', query.name)
        for query in self.get_nodes_by_type('search_query')[:20]
    ]

    topics = self.get_nodes_by_type('topic')
    overview['topic'] = topics[0].name if topics else ''
    overview['engines'] = [engine.name for engine in self.get_nodes_by_type('engine')]
    return overview
| 258 | + | ||
def to_dict(self) -> Dict[str, Any]:
    """Serialise the graph into a plain dict.

    Returns:
        A dict with the serialised nodes, the serialised edges and the
        statistics produced by ``get_stats()``.
    """
    node_dicts = []
    for node in self.node_list:
        node_dicts.append(node.to_dict())

    edge_dicts = []
    for edge in self.edges:
        edge_dicts.append(edge.to_dict())

    return {'nodes': node_dicts, 'edges': edge_dicts, 'stats': self.get_stats()}
| 266 | + | ||
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'Graph':
    """Rebuild a graph from its dict form.

    Args:
        data: Dict with optional ``nodes`` and ``edges`` lists.

    Returns:
        A new graph holding the deserialised nodes and edges.
    """
    graph = cls()

    # Nodes first, so that every node owns an adjacency entry before the
    # edges are replayed.
    for raw_node in data.get('nodes', []):
        node = Node.from_dict(raw_node)
        graph._nodes[node.id] = node
        graph._adjacency[node.id] = set()

    for raw_edge in data.get('edges', []):
        edge = Edge.from_dict(raw_edge)
        graph._edges.append(edge)
        # Link both directions, but only for endpoints known to the graph.
        for here, there in ((edge.from_id, edge.to_id), (edge.to_id, edge.from_id)):
            if here in graph._adjacency:
                graph._adjacency[here].add(there)

    return graph
| 289 | + | ||
| 290 | + | ||
class GraphStorage:
    """Save knowledge graphs as JSON files and locate previously saved ones.

    Every run directory holds at most one graph file, named ``FILENAME``.
    The lookup helpers scan the immediate sub-directories of
    ``DEFAULT_CHAPTERS_DIR``.
    """

    FILENAME = "graphrag.json"
    # Relative path, so it resolves against the current working directory.
    DEFAULT_CHAPTERS_DIR = Path("chapters")

    def save(self, graph: Graph, task_id: str, run_dir: Path) -> Path:
        """Write ``graph`` to ``run_dir / FILENAME`` as JSON.

        Args:
            graph: Graph to serialise (its ``to_dict()`` output is stored).
            task_id: Task identifier recorded in the file.
            run_dir: Target directory; created when it does not exist.

        Returns:
            Path of the written file.
        """
        run_dir = Path(run_dir)
        run_dir.mkdir(parents=True, exist_ok=True)

        output = {
            'task_id': task_id,
            'created_at': datetime.now().isoformat(),
            **graph.to_dict()
        }

        file_path = run_dir / self.FILENAME
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(output, f, ensure_ascii=False, indent=2)

        return file_path

    def load(self, path: Path) -> Optional[Graph]:
        """Load a graph from a JSON file.

        Args:
            path: Either the graph file itself or a run directory that
                contains ``FILENAME``.

        Returns:
            The loaded graph, or ``None`` when the file is missing or cannot
            be read or parsed.
        """
        path = Path(path)
        file_path = path / self.FILENAME if path.is_dir() else path

        if not file_path.exists():
            return None

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return Graph.from_dict(data)
        except Exception:
            # Deliberate best effort: an unreadable or malformed file is
            # reported as "no graph" rather than raised to the caller.
            return None

    def exists(self, run_dir: Path) -> bool:
        """Return whether ``run_dir`` contains a graph file."""
        return (Path(run_dir) / self.FILENAME).exists()

    def _iter_graph_files(self):
        """Yield ``(run_dir, graph_path)`` for each sub-directory of the
        chapters directory that contains a graph file."""
        chapters_dir = self.DEFAULT_CHAPTERS_DIR
        if not chapters_dir.exists():
            return

        for run_dir in chapters_dir.iterdir():
            if not run_dir.is_dir():
                continue
            graph_path = run_dir / self.FILENAME
            if graph_path.exists():
                yield run_dir, graph_path

    def find_graph_by_report_id(self, report_id: str) -> Optional[Path]:
        """Find the graph file of the run whose directory name contains
        ``report_id``.

        Args:
            report_id: Report identifier to look for.

        Returns:
            Path of the first matching graph file, or ``None`` when nothing
            matches or ``report_id`` is empty.
        """
        if not report_id:
            # '' is a substring of every directory name; without this guard
            # an empty id would return an arbitrary graph.
            return None

        for run_dir, graph_path in self._iter_graph_files():
            if report_id in run_dir.name:
                return graph_path

        return None

    def find_latest_graph(self) -> Optional[Path]:
        """Find the most recently modified graph file.

        Returns:
            Path of the graph file with the newest modification time, or
            ``None`` when no graph file exists.
        """
        latest_path = None
        latest_time = None

        for _, graph_path in self._iter_graph_files():
            mtime = graph_path.stat().st_mtime
            if latest_time is None or mtime > latest_time:
                latest_time = mtime
                latest_path = graph_path

        return latest_path

    def list_all_graphs(self) -> List[Dict[str, Any]]:
        """List every readable graph file.

        Returns:
            One dict per graph with ``path``, ``report_id``, ``created_at``,
            ``stats`` and ``dir_name``, newest ``created_at`` first. Files
            that cannot be read or parsed are skipped.
        """
        graphs = []
        for run_dir, graph_path in self._iter_graph_files():
            try:
                with open(graph_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                graphs.append({
                    'path': str(graph_path),
                    'report_id': data.get('task_id', run_dir.name),
                    'created_at': data.get('created_at'),
                    'stats': data.get('stats', {}),
                    'dir_name': run_dir.name
                })
            except Exception:
                # Best effort: one broken file must not hide the others.
                continue

        # 'created_at' is always present in the entry but may be None, so
        # fall back to '' explicitly; comparing None with str would raise.
        graphs.sort(key=lambda x: x.get('created_at') or '', reverse=True)
        return graphs
ReportEngine/graphrag/prompts.py
0 → 100644
| 1 | +""" | ||
| 2 | +GraphRAG 提示词模块 | ||
| 3 | + | ||
| 4 | +包含查询决策和章节增强的完整提示词定义。 | ||
| 5 | +""" | ||
| 6 | + | ||
| 7 | +# ================== 查询决策提示词 ================== | ||
| 8 | + | ||
| 9 | +GRAPHRAG_QUERY_DECISION_SYSTEM = """你是一个智能舆情分析助手,负责决定如何查询知识图谱以获取生成报告章节所需的信息。 | ||
| 10 | + | ||
| 11 | +知识图谱包含以下节点类型: | ||
| 12 | +- Topic: 用户查询的主题 | ||
| 13 | +- Engine: 四个分析引擎(Insight/Media/Query/Host) | ||
| 14 | +- Section: 各引擎报告的段落章节 | ||
| 15 | +- SearchQuery: 引擎执行过的搜索关键词 | ||
| 16 | +- Source: 搜索发现的信息来源(URL、标题、内容摘要) | ||
| 17 | + | ||
| 18 | +你的任务是根据当前章节的需求,决定查询参数以获取最相关的信息。""" | ||
| 19 | + | ||
| 20 | +GRAPHRAG_QUERY_DECISION_USER = """ | ||
| 21 | +=== 当前任务 === | ||
| 22 | +正在生成报告章节: "{chapter_title}" | ||
| 23 | +章节编号: {chapter_id} | ||
| 24 | +章节在模板中的定位: {chapter_role} | ||
| 25 | +目标字数: {target_words}字 | ||
| 26 | +章节要点: {chapter_emphasis} | ||
| 27 | + | ||
| 28 | +=== 完整报告规划 === | ||
| 29 | +报告主题: {report_topic} | ||
| 30 | +模板类型: {template_name} | ||
| 31 | +全书章节概览: | ||
| 32 | +{chapters_overview} | ||
| 33 | + | ||
| 34 | +=== 知识图谱概览 === | ||
| 35 | +图谱统计: | ||
| 36 | +- 主题节点: 1个 ({topic_name}) | ||
| 37 | +- 引擎节点: {engine_count}个 | ||
| 38 | +- 段落节点: {section_count}个 | ||
| 39 | +- 搜索词节点: {query_count}个 | ||
| 40 | +- 来源节点: {source_count}个 | ||
| 41 | + | ||
| 42 | +各引擎段落标题: | ||
| 43 | +{section_titles_by_engine} | ||
| 44 | + | ||
| 45 | +搜索关键词样例(前20个): | ||
| 46 | +{sample_search_queries} | ||
| 47 | + | ||
| 48 | +=== 查询历史记录(本章节已执行的查询) === | ||
| 49 | +{query_history_detail} | ||
| 50 | + | ||
| 51 | +=== 请决定查询参数 === | ||
| 52 | +请输出JSON格式的查询参数: | ||
| 53 | +```json | ||
| 54 | +{{ | ||
| 55 | + "should_query": true/false, | ||
| 56 | + "keywords": ["关键词1", "关键词2", ...], | ||
| 57 | + "node_types": ["section", "search_query", "source"], | ||
| 58 | + "engine_filter": ["insight", "media", "query", "host"], | ||
| 59 | + "depth": 1-3, | ||
| 60 | + "reasoning": "选择这些参数的原因,以及期望获取什么信息" | ||
| 61 | +}} | ||
| 62 | +``` | ||
| 63 | + | ||
| 64 | +注意事项: | ||
| 65 | +1. 仔细查看查询历史,**避免重复查询相同或相似的关键词** | ||
| 66 | +2. 关键词应与当前章节主题紧密相关 | ||
| 67 | +3. 如果查询历史已经覆盖了章节所需的主要信息,设置 should_query=false | ||
| 68 | +4. depth建议:1=精确匹配,2=包含关联,3=扩展探索(信息量大但可能有噪音) | ||
| 69 | +5. 可以通过 engine_filter 聚焦特定引擎的分析视角 | ||
| 70 | +""" | ||
| 71 | + | ||
| 72 | +# ================== 章节增强提示词(GraphRAG 开启时使用) ================== | ||
| 73 | + | ||
| 74 | +SYSTEM_PROMPT_CHAPTER_GRAPH_ENHANCEMENT = """ | ||
| 75 | +=== GraphRAG 知识图谱增强 === | ||
| 76 | +本次章节生成已通过知识图谱查询获取了跨引擎的关联信息。 | ||
| 77 | +在生成内容时,请特别注意: | ||
| 78 | + | ||
| 79 | +1. **跨引擎关联**: graphResults 中包含了来自不同引擎的相关信息, | ||
| 80 | + 请综合利用这些多视角的分析结果,形成更全面的观点。 | ||
| 81 | + | ||
| 82 | +2. **信息溯源**: 对于重要观点,可以引用 graphResults.matched_sources | ||
| 83 | + 中的来源信息,增强可信度。 | ||
| 84 | + | ||
| 85 | +3. **搜索词关联**: graphResults.matched_queries 显示了各引擎为本主题 | ||
| 86 | + 执行的相关搜索,这些搜索词本身就是重要的语义线索。 | ||
| 87 | + | ||
| 88 | +4. **避免重复**: 不同引擎可能有相似的分析,请整合而非重复。 | ||
| 89 | +""" | ||
| 90 | + | ||
| 91 | +USER_PROMPT_GRAPH_RESULTS_TEMPLATE = """ | ||
| 92 | +=== GraphRAG 知识图谱查询结果 === | ||
| 93 | + | ||
| 94 | +**查询轮次**: {query_rounds}次 | ||
| 95 | + | ||
| 96 | +**匹配的相关段落** (来自其他引擎的相关分析): | ||
| 97 | +{matched_sections} | ||
| 98 | + | ||
| 99 | +**相关搜索关键词** (各引擎执行的相关搜索): | ||
| 100 | +{matched_queries} | ||
| 101 | + | ||
| 102 | +**相关信息来源** (搜索发现的相关URL和内容): | ||
| 103 | +{matched_sources} | ||
| 104 | + | ||
| 105 | +**跨引擎关联洞察**: | ||
| 106 | +{cross_engine_insights} | ||
| 107 | + | ||
| 108 | +请在生成本章节时,充分利用以上知识图谱查询结果, | ||
| 109 | +特别是跨引擎的关联信息,以丰富内容的多维度分析。 | ||
| 110 | +=== | ||
| 111 | + | ||
| 112 | +""" | ||
| 113 | + | ||
| 114 | + | ||
def format_graph_results_for_prompt(graph_results: dict) -> str:
    """Render GraphRAG query results as a text section for the prompt.

    Args:
        graph_results: Query result dict; the keys read are
            ``matched_sections``, ``matched_queries``, ``matched_sources``,
            ``cross_engine_insights`` and ``query_rounds``.

    Returns:
        The filled-in results template, or an empty string when
        ``graph_results`` is empty.
    """
    if not graph_results:
        return ""

    fetch = graph_results.get
    rendered = {
        'matched_sections': _format_matched_sections(fetch('matched_sections', [])),
        'matched_queries': _format_matched_queries(fetch('matched_queries', [])),
        'matched_sources': _format_matched_sources(fetch('matched_sources', [])),
        'cross_engine_insights': _format_cross_engine_insights(
            fetch('cross_engine_insights', [])
        ),
    }

    return USER_PROMPT_GRAPH_RESULTS_TEMPLATE.format(
        query_rounds=fetch('query_rounds', 0),
        **rendered
    )
| 151 | + | ||
| 152 | + | ||
def _format_matched_sections(sections: list) -> str:
    """Format matched sections as one bullet line per section.

    At most 10 sections are rendered. Each summary is cut to 100 characters,
    and the trailing ``...`` is added only when text was actually cut off.

    Args:
        sections: Section dicts; ``engine``, ``title`` and ``summary`` are read.

    Returns:
        The bullet list, or a placeholder line when ``sections`` is empty.
    """
    if not sections:
        return "(无匹配段落)"

    max_summary = 100
    lines = []
    for section in sections[:10]:  # cap the number of rendered sections
        engine = section.get('engine') or 'unknown'
        title = section.get('title') or '未知标题'
        # A key that is present but None must not break the slicing below.
        summary = str(section.get('summary') or '')
        if len(summary) > max_summary:
            summary = summary[:max_summary] + "..."
        lines.append(f"- [{engine}] {title}: {summary}")

    return "\n".join(lines)
| 166 | + | ||
| 167 | + | ||
def _format_matched_queries(queries: list) -> str:
    """Format matched search queries grouped by engine.

    Query texts are de-duplicated per engine and at most five are shown for
    each engine.

    Args:
        queries: Query dicts; ``engine`` and ``query_text`` (falling back to
            ``name``) are read.

    Returns:
        One bullet line per engine, or a placeholder line when ``queries``
        is empty.
    """
    if not queries:
        return "(无匹配搜索词)"

    grouped = {}
    for entry in queries:
        bucket = grouped.setdefault(entry.get('engine', 'unknown'), [])
        text = entry.get('query_text', entry.get('name', ''))
        if not text or text in bucket:
            continue
        bucket.append(text)

    return "\n".join(
        f"- {engine}: {', '.join(texts[:5])}" for engine, texts in grouped.items()
    )
| 187 | + | ||
| 188 | + | ||
def _format_matched_sources(sources: list) -> str:
    """Format matched sources as markdown links with an optional preview line.

    At most 8 sources are rendered. A preview is cut to 80 characters, and
    the trailing ``...`` is added only when text was actually cut off.

    Args:
        sources: Source dicts; ``title``, ``url`` and ``preview`` are read.

    Returns:
        The formatted lines, or a placeholder line when ``sources`` is empty.
    """
    if not sources:
        return "(无匹配来源)"

    max_preview = 80
    lines = []
    for source in sources[:8]:
        title = source.get('title', '未知标题')
        url = source.get('url', '#')
        lines.append(f"- [{title}]({url})")

        preview = source.get('preview', '')
        if preview:
            # Coerce so that a non-string preview cannot break the slicing.
            preview = str(preview)
            if len(preview) > max_preview:
                preview = preview[:max_preview] + "..."
            lines.append(f" 摘要: {preview}")

    return "\n".join(lines)
| 204 | + | ||
| 205 | + | ||
def _format_cross_engine_insights(insights: list) -> str:
    """Format cross-engine insights as a bullet list (first five only).

    Returns a placeholder line when ``insights`` is empty.
    """
    if not insights:
        return "(无跨引擎关联发现)"

    bullets = []
    for insight in insights[:5]:
        bullets.append(f"- {insight}")
    return "\n".join(bullets)
ReportEngine/graphrag/query_engine.py
0 → 100644
| 1 | +""" | ||
| 2 | +图查询引擎 | ||
| 3 | + | ||
| 4 | +支持基于关键词、节点类型、引擎来源和深度的知识图谱查询。 | ||
| 5 | +""" | ||
| 6 | + | ||
| 7 | +from dataclasses import dataclass, field | ||
| 8 | +from typing import Dict, Any, List, Optional, Set | ||
| 9 | + | ||
| 10 | +from .graph_storage import Graph, Node | ||
| 11 | + | ||
| 12 | + | ||
@dataclass
class QueryParams:
    """Parameters of a single knowledge-graph query."""
    # Keywords to search for; QueryEngine treats an empty list as "match every node".
    keywords: List[str] = field(default_factory=list)
    # Node types to keep; None means all types.
    node_types: Optional[List[str]] = None
    # Restrict matches to these source engines; None means no restriction.
    engine_filter: Optional[List[str]] = None
    # Expansion depth applied to the matched nodes.
    depth: int = 1
| 20 | + | ||
| 21 | + | ||
@dataclass
class QueryResult:
    """Outcome of a knowledge-graph query, grouped by node type."""
    matched_sections: List[Dict[str, Any]] = field(default_factory=list)
    matched_queries: List[Dict[str, Any]] = field(default_factory=list)
    matched_sources: List[Dict[str, Any]] = field(default_factory=list)
    total_nodes: int = 0
    query_params: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict (field name -> field value)."""
        exported = (
            'matched_sections',
            'matched_queries',
            'matched_sources',
            'total_nodes',
            'query_params',
        )
        return {name: getattr(self, name) for name in exported}

    def get_summary(self, max_length: int = 200) -> str:
        """Return a one-line summary of the result.

        Args:
            max_length: Maximum number of characters of the summary.

        Returns:
            Counts per result group plus up to three sample section titles
            and query texts, cut to ``max_length`` characters; a fixed
            "no match" text when every group is empty.
        """
        fragments = []
        sections = self.matched_sections
        queries = self.matched_queries
        sources = self.matched_sources

        if sections:
            titles = ', '.join(s.get('title', '')[:30] for s in sections[:3])
            fragments.append(f"段落({len(sections)}): {titles}")

        if queries:
            texts = ', '.join(q.get('query_text', '')[:20] for q in queries[:3])
            fragments.append(f"搜索词({len(queries)}): {texts}")

        if sources:
            fragments.append(f"来源({len(sources)})")

        summary = "; ".join(fragments) if fragments else "无匹配结果"
        return summary[:max_length]
| 58 | + | ||
| 59 | + | ||
class QueryEngine:
    """Keyword-driven query engine over the knowledge graph.

    Capabilities:
    1. Keyword matching against node names and selected attributes.
    2. Type filtering (section / search_query / source).
    3. Engine filtering (insight / media / query / host).
    4. Depth expansion from the matched nodes to their neighbours.
    """

    def __init__(self, graph: Graph):
        """Bind the engine to a graph.

        Args:
            graph: Knowledge graph to query.
        """
        self.graph = graph

    def query(self, params: QueryParams) -> QueryResult:
        """Run a query against the graph.

        Args:
            params: Query parameters.

        Returns:
            The matched (and depth-expanded) nodes grouped by type.
        """
        # 1. Seed set: nodes passing the type / engine / keyword filters.
        matched_nodes = self._match_keywords(params)

        # 2. Add neighbours up to the requested depth.
        if params.depth > 0 and matched_nodes:
            expanded_nodes = self._expand_depth(matched_nodes, params.depth)
            matched_nodes = matched_nodes.union(expanded_nodes)

        # 3. Group and sort.
        return self._organize_results(matched_nodes, params)

    def _match_keywords(self, params: QueryParams) -> Set[str]:
        """Return the ids of nodes passing the type, engine and keyword filters."""
        matched_ids = set()

        for node in self.graph.nodes:
            if params.node_types and node.type not in params.node_types:
                continue

            if params.engine_filter:
                node_engine = node.get('engine')
                # Nodes without an engine attribute are not filtered out.
                if node_engine and node_engine not in params.engine_filter:
                    continue

            if self._matches_keywords(node, params.keywords):
                matched_ids.add(node.id)

        return matched_ids

    def _matches_keywords(self, node: Node, keywords: List[str]) -> bool:
        """Return whether ``node`` matches any of ``keywords``.

        Matching is a case-insensitive substring test against the node name
        and its ``title``, ``query_text`` and ``summary`` attributes. With no
        usable keyword every node matches.
        """
        # Drop blank entries: '' is a substring of every text, so a single
        # empty keyword would otherwise make every node match.
        terms = [
            term
            for term in (str(kw).strip().lower() for kw in (keywords or []) if kw is not None)
            if term
        ]
        if not terms:
            return True

        fields = (
            node.name,
            node.get('title', ''),
            node.get('query_text', ''),
            node.get('summary', ''),
        )
        # None values are skipped so that they are not searched as "None".
        search_text = " ".join(str(value) for value in fields if value is not None).lower()

        return any(term in search_text for term in terms)

    def _expand_depth(self, node_ids: Set[str], depth: int) -> Set[str]:
        """Return the ids reachable from ``node_ids`` within ``depth`` hops.

        The seed ids themselves are not part of the returned set.
        """
        expanded = set()
        current_layer = node_ids.copy()

        for _ in range(depth):
            next_layer = set()

            for node_id in current_layer:
                for neighbor in self.graph.get_neighbors(node_id):
                    if neighbor.id not in node_ids and neighbor.id not in expanded:
                        next_layer.add(neighbor.id)
                        expanded.add(neighbor.id)

            if not next_layer:
                break  # frontier exhausted before reaching the requested depth

            current_layer = next_layer

        return expanded

    def _organize_results(self, node_ids: Set[str],
                          params: QueryParams) -> QueryResult:
        """Group the given nodes by type and wrap them in a ``QueryResult``.

        Only ``section``, ``search_query`` and ``source`` nodes are listed;
        ``total_nodes`` counts every id in ``node_ids``.
        """
        matched_sections = []
        matched_queries = []
        matched_sources = []

        for node_id in node_ids:
            node = self.graph.get_node(node_id)
            if not node:
                continue

            node_dict = {
                'id': node.id,
                'name': node.name,
                'type': node.type,
                **node.attributes
            }

            if node.type == 'section':
                matched_sections.append(node_dict)
            elif node.type == 'search_query':
                matched_queries.append(node_dict)
            elif node.type == 'source':
                matched_sources.append(node_dict)

        # Sections sort by order, the others by text. "or" covers attributes
        # that are present but None, which would otherwise break the sort.
        matched_sections.sort(key=lambda x: x.get('order') or 0)
        matched_queries.sort(key=lambda x: x.get('query_text') or '')
        matched_sources.sort(key=lambda x: x.get('title') or '')

        return QueryResult(
            matched_sections=matched_sections,
            matched_queries=matched_queries,
            matched_sources=matched_sources,
            total_nodes=len(node_ids),
            query_params={
                'keywords': params.keywords,
                'node_types': params.node_types,
                'engine_filter': params.engine_filter,
                'depth': params.depth
            }
        )

    def get_node_summary(self) -> Dict[str, Any]:
        """Return the graph overview produced by ``graph.get_summary()``."""
        return self.graph.get_summary()

    def get_section_titles_by_engine(self) -> Dict[str, List[str]]:
        """Return the titles of all section nodes grouped by engine.

        Sections without an engine attribute are grouped under ``unknown``;
        a missing title falls back to the node name.
        """
        result = {}

        for node in self.graph.get_nodes_by_type('section'):
            engine = node.get('engine', 'unknown')
            result.setdefault(engine, []).append(node.get('title', node.name))

        return result

    def get_sample_search_queries(self, limit: int = 20) -> List[str]:
        """Return up to ``limit`` distinct search-query texts."""
        queries = []

        for node in self.graph.get_nodes_by_type('search_query'):
            query_text = node.get('query_text', node.name)
            if query_text and query_text not in queries:
                queries.append(query_text)
                if len(queries) >= limit:
                    break

        return queries
ReportEngine/graphrag/state_parser.py
0 → 100644
| 1 | +""" | ||
| 2 | +State JSON 解析器 | ||
| 3 | + | ||
| 4 | +解析 Insight/Media/Query 三引擎的 State JSON 文件, | ||
| 5 | +提取结构化数据用于构建知识图谱。 | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +from dataclasses import dataclass, field | ||
| 9 | +from typing import Dict, Any, List, Optional | ||
| 10 | +import json | ||
| 11 | +from pathlib import Path | ||
| 12 | + | ||
| 13 | + | ||
@dataclass
class SearchRecord:
    """A single search record taken from a paragraph's search history."""
    # Search query text.
    query: str = ""
    # URL of the search hit.
    url: str = ""
    # Title of the search hit.
    title: str = ""
    # Content text; StateParser stores at most the first 200 characters.
    content: str = ""
    # Score as found in the state JSON; None when absent.
    score: Optional[float] = None
    # Timestamp string as found in the state JSON.
    timestamp: str = ""
| 23 | + | ||
| 24 | + | ||
@dataclass
class ParsedSection:
    """A parsed paragraph / section of an engine report."""
    # Section title.
    title: str = ""
    # Value of the paragraph's "order" field.
    order: int = 0
    # Summary text; StateParser stores at most the first 300 characters.
    summary: str = ""
    # Search records collected for this section.
    search_history: List[SearchRecord] = field(default_factory=list)
| 32 | + | ||
| 33 | + | ||
@dataclass
class ParsedState:
    """Parsed state of one engine."""
    # Engine name passed to the parser.
    engine: str = ""
    # Value of the state's "query" field.
    query: str = ""
    # Value of the state's "report_title" field.
    report_title: str = ""
    # Parsed paragraphs, in the order they appear in the state JSON.
    sections: List[ParsedSection] = field(default_factory=list)
| 41 | + | ||
| 42 | + | ||
class StateParser:
    """Parser for engine State JSON.

    Extracts the structured data (sections and their search history) that is
    used to build the knowledge graph.
    """

    def parse(self, engine_name: str, state_json: Dict[str, Any]) -> ParsedState:
        """Parse the State JSON of a single engine.

        Args:
            engine_name: Engine name (insight/media/query).
            state_json: Decoded State JSON dict.

        Returns:
            The parsed state. A missing or null ``paragraphs`` entry yields
            an empty section list.
        """
        # JSON null decodes to None, so "or" is used instead of get() defaults.
        paragraphs = state_json.get('paragraphs') or []
        return ParsedState(
            engine=engine_name,
            query=state_json.get('query') or '',
            report_title=state_json.get('report_title') or '',
            sections=[self._parse_paragraph(p) for p in paragraphs]
        )

    def _parse_paragraph(self, para: Dict[str, Any]) -> ParsedSection:
        """Parse one paragraph dict into a ``ParsedSection``.

        Null ``research`` / ``search_history`` values are treated as empty
        instead of raising.
        """
        research = para.get('research') or {}

        search_history = []
        for search in research.get('search_history') or []:
            content = search.get('content')
            search_history.append(SearchRecord(
                query=search.get('query') or '',
                url=search.get('url') or '',
                title=search.get('title') or '',
                # Keep only a short excerpt of the hit content.
                content=content[:200] if content else '',
                score=search.get('score'),
                timestamp=search.get('timestamp') or ''
            ))

        # Prefer the latest research summary, fall back to the paragraph content.
        summary = research.get('latest_summary') or para.get('content') or ''

        return ParsedSection(
            title=para.get('title') or '',
            order=para.get('order') or 0,
            summary=summary[:300] if summary else '',
            search_history=search_history
        )

    def parse_from_file(self, engine_name: str, file_path: str) -> Optional[ParsedState]:
        """Parse a State JSON file.

        Args:
            engine_name: Engine name.
            file_path: Path of the JSON file.

        Returns:
            The parsed state, or ``None`` when the file is missing or cannot
            be read or parsed.
        """
        try:
            path = Path(file_path)
            if not path.exists():
                return None

            with open(path, 'r', encoding='utf-8') as f:
                state_json = json.load(f)

            return self.parse(engine_name, state_json)
        except Exception:
            # Deliberate best effort: a broken state file is reported as
            # "no state" rather than raised to the caller.
            return None

    def find_state_json(self, md_path: str) -> Optional[str]:
        """Find the State JSON file that belongs to a Markdown report.

        Candidates are the ``state_*.json`` files in the directory of the
        Markdown file.

        Args:
            md_path: Path of the Markdown file.

        Returns:
            Path of the State JSON file whose name matches the Markdown file
            name, otherwise the most recently modified candidate; ``None``
            when the Markdown file or any candidate is missing.
        """
        md_file = Path(md_path)
        if not md_file.exists():
            return None

        state_files = list(md_file.parent.glob('state_*.json'))
        if not state_files:
            return None

        md_stem = md_file.stem  # e.g. "topic_20250825_180214"

        for state_file in state_files:
            state_stem = state_file.stem  # e.g. "state_topic_20250825_180214"
            # Same query text and timestamp in both names.
            if md_stem in state_stem or state_stem.replace('state_', '') == md_stem:
                return str(state_file)

        # No name match: fall back to the newest candidate.
        state_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)
        return str(state_files[0])
| @@ -14,6 +14,7 @@ from .chapter_generation_node import ( | @@ -14,6 +14,7 @@ from .chapter_generation_node import ( | ||
| 14 | ) | 14 | ) |
| 15 | from .document_layout_node import DocumentLayoutNode | 15 | from .document_layout_node import DocumentLayoutNode |
| 16 | from .word_budget_node import WordBudgetNode | 16 | from .word_budget_node import WordBudgetNode |
| 17 | +from .graphrag_query_node import GraphRAGQueryNode, QueryHistory | ||
| 17 | 18 | ||
| 18 | __all__ = [ | 19 | __all__ = [ |
| 19 | "BaseNode", | 20 | "BaseNode", |
| @@ -25,4 +26,6 @@ __all__ = [ | @@ -25,4 +26,6 @@ __all__ = [ | ||
| 25 | "ChapterValidationError", | 26 | "ChapterValidationError", |
| 26 | "DocumentLayoutNode", | 27 | "DocumentLayoutNode", |
| 27 | "WordBudgetNode", | 28 | "WordBudgetNode", |
| 29 | + "GraphRAGQueryNode", | ||
| 30 | + "QueryHistory", | ||
| 28 | ] | 31 | ] |
| @@ -205,11 +205,15 @@ class ChapterGenerationNode(BaseNode): | @@ -205,11 +205,15 @@ class ChapterGenerationNode(BaseNode): | ||
| 205 | llm_payload = self._build_payload(section, context) | 205 | llm_payload = self._build_payload(section, context) |
| 206 | user_message = build_chapter_user_prompt(llm_payload) | 206 | user_message = build_chapter_user_prompt(llm_payload) |
| 207 | 207 | ||
| 208 | + # 检查是否有GraphRAG结果,决定是否使用增强提示词 | ||
| 209 | + graph_enhanced = bool(context.get("graph_results")) | ||
| 210 | + | ||
| 208 | raw_text = self._stream_llm( | 211 | raw_text = self._stream_llm( |
| 209 | user_message, | 212 | user_message, |
| 210 | chapter_dir, | 213 | chapter_dir, |
| 211 | stream_callback=stream_callback, | 214 | stream_callback=stream_callback, |
| 212 | section_meta=chapter_meta, | 215 | section_meta=chapter_meta, |
| 216 | + graph_enhanced=graph_enhanced, | ||
| 213 | **kwargs, | 217 | **kwargs, |
| 214 | ) | 218 | ) |
| 215 | parse_context: List[str] = [] | 219 | parse_context: List[str] = [] |
| @@ -351,6 +355,22 @@ class ChapterGenerationNode(BaseNode): | @@ -351,6 +355,22 @@ class ChapterGenerationNode(BaseNode): | ||
| 351 | "chapterPlan": chapter_plan, | 355 | "chapterPlan": chapter_plan, |
| 352 | "wordPlan": context.get("word_plan"), | 356 | "wordPlan": context.get("word_plan"), |
| 353 | } | 357 | } |
| 358 | + | ||
| 359 | + # GraphRAG 增强:如果上下文中包含图谱查询结果,添加到payload | ||
| 360 | + graph_results = context.get("graph_results") | ||
| 361 | + if graph_results: | ||
| 362 | + payload["graphResults"] = { | ||
| 363 | + "totalNodes": graph_results.get("total_nodes", 0), | ||
| 364 | + "queryRounds": graph_results.get("query_rounds", 0), | ||
| 365 | + "matchedSections": graph_results.get("matched_sections", []), | ||
| 366 | + "matchedQueries": graph_results.get("matched_queries", []), | ||
| 367 | + "matchedSources": graph_results.get("matched_sources", []), | ||
| 368 | + } | ||
| 369 | + # 同时添加增强提示(如果有) | ||
| 370 | + graph_enhancement = context.get("graph_enhancement_prompt") | ||
| 371 | + if graph_enhancement: | ||
| 372 | + payload["graphEnhancementPrompt"] = graph_enhancement | ||
| 373 | + | ||
| 354 | if chapter_plan: | 374 | if chapter_plan: |
| 355 | constraints = payload["constraints"] | 375 | constraints = payload["constraints"] |
| 356 | if chapter_plan.get("targetWords"): | 376 | if chapter_plan.get("targetWords"): |
| @@ -438,6 +458,7 @@ class ChapterGenerationNode(BaseNode): | @@ -438,6 +458,7 @@ class ChapterGenerationNode(BaseNode): | ||
| 438 | chapter_dir: Path, | 458 | chapter_dir: Path, |
| 439 | stream_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None, | 459 | stream_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None, |
| 440 | section_meta: Optional[Dict[str, Any]] = None, | 460 | section_meta: Optional[Dict[str, Any]] = None, |
| 461 | + graph_enhanced: bool = False, | ||
| 441 | **kwargs, | 462 | **kwargs, |
| 442 | ) -> str: | 463 | ) -> str: |
| 443 | """ | 464 | """ |
| @@ -448,15 +469,23 @@ class ChapterGenerationNode(BaseNode): | @@ -448,15 +469,23 @@ class ChapterGenerationNode(BaseNode): | ||
| 448 | chapter_dir: 章节的本地缓存目录,用于存放 stream.raw。 | 469 | chapter_dir: 章节的本地缓存目录,用于存放 stream.raw。 |
| 449 | stream_callback: SSE流式推送的回调函数。 | 470 | stream_callback: SSE流式推送的回调函数。 |
| 450 | section_meta: 附带的章节ID/标题,用于回调payload。 | 471 | section_meta: 附带的章节ID/标题,用于回调payload。 |
| 472 | + graph_enhanced: 是否启用GraphRAG增强的系统提示词。 | ||
| 451 | **kwargs: 透传温度、top_p等参数。 | 473 | **kwargs: 透传温度、top_p等参数。 |
| 452 | 474 | ||
| 453 | 返回: | 475 | 返回: |
| 454 | str: 将所有delta拼接后的原始文本。 | 476 | str: 将所有delta拼接后的原始文本。 |
| 455 | """ | 477 | """ |
| 478 | + # 根据是否启用GraphRAG选择不同的系统提示词 | ||
| 479 | + if graph_enhanced: | ||
| 480 | + from ..graphrag.prompts import SYSTEM_PROMPT_CHAPTER_GRAPH_ENHANCEMENT | ||
| 481 | + system_prompt = SYSTEM_PROMPT_CHAPTER_JSON + "\n\n" + SYSTEM_PROMPT_CHAPTER_GRAPH_ENHANCEMENT | ||
| 482 | + else: | ||
| 483 | + system_prompt = SYSTEM_PROMPT_CHAPTER_JSON | ||
| 484 | + | ||
| 456 | chunks: List[str] = [] | 485 | chunks: List[str] = [] |
| 457 | with self.storage.capture_stream(chapter_dir) as stream_fp: | 486 | with self.storage.capture_stream(chapter_dir) as stream_fp: |
| 458 | stream = self.llm_client.stream_invoke( | 487 | stream = self.llm_client.stream_invoke( |
| 459 | - SYSTEM_PROMPT_CHAPTER_JSON, | 488 | + system_prompt, |
| 460 | user_message, | 489 | user_message, |
| 461 | temperature=kwargs.get("temperature", 0.2), | 490 | temperature=kwargs.get("temperature", 0.2), |
| 462 | top_p=kwargs.get("top_p", 0.95), | 491 | top_p=kwargs.get("top_p", 0.95), |
ReportEngine/nodes/graphrag_query_node.py
0 → 100644
| 1 | +""" | ||
| 2 | +GraphRAG 查询节点 | ||
| 3 | + | ||
| 4 | +负责与知识图谱交互,让 LLM 决定查询参数并执行多轮查询。 | ||
| 5 | +包含查询历史机制以防止重复查询。 | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +import json | ||
| 9 | +import re | ||
| 10 | +from dataclasses import dataclass, field | ||
| 11 | +from typing import Dict, Any, List, Optional | ||
| 12 | + | ||
| 13 | +from loguru import logger | ||
| 14 | + | ||
| 15 | +from .base_node import BaseNode | ||
| 16 | +from ..llms.base import LLMClient | ||
| 17 | +from ..graphrag.graph_storage import Graph | ||
| 18 | +from ..graphrag.query_engine import QueryEngine, QueryParams, QueryResult | ||
| 19 | +from ..graphrag.prompts import ( | ||
| 20 | + GRAPHRAG_QUERY_DECISION_SYSTEM, | ||
| 21 | + GRAPHRAG_QUERY_DECISION_USER | ||
| 22 | +) | ||
| 23 | + | ||
| 24 | + | ||
@dataclass
class QueryRound:
    """Record of a single knowledge-graph query round."""
    # Ordinal of this round within the history.
    round: int
    # Parameters that were used for the query.
    params: Dict[str, Any]
    # Number of nodes the query returned.
    result_count: int
    # Short textual summary of the query result.
    summary: str
| 32 | + | ||
| 33 | + | ||
class QueryHistory:
    """
    Recorder for the knowledge-graph queries issued so far.

    Keeps each round's parameters and result summary so that the LLM can be
    shown what it already asked and avoid repeating the same query.
    """

    def __init__(self):
        self.rounds: List[QueryRound] = []

    @staticmethod
    def _as_str_list(value: Any) -> List[str]:
        """
        Coerce a parameter value into a list of strings.

        The parameters recorded here are whatever the caller passed to
        ``add`` and may be ``None``, a bare string or a list, so they are
        normalised before being joined or concatenated.
        """
        if value is None:
            return []
        if isinstance(value, str):
            # A bare string is one keyword, not a sequence of characters.
            return [value] if value else []
        if isinstance(value, (list, tuple, set)):
            return [str(item) for item in value if item is not None]
        return [str(value)]

    def add(self, params: Dict[str, Any], result: "QueryResult") -> None:
        """
        Record one query.

        Args:
            params: Query parameters used for the round.
            result: Query result; its ``total_nodes`` and ``get_summary()``
                are stored alongside the parameters.
        """
        self.rounds.append(QueryRound(
            round=len(self.rounds) + 1,
            params=params,
            result_count=result.total_nodes,
            summary=result.get_summary()
        ))

    def to_prompt(self) -> str:
        """
        Build the history context that is handed to the LLM.

        Returns:
            A formatted multi-line description of every recorded round, or a
            placeholder sentence when nothing has been recorded yet.
        """
        if not self.rounds:
            return "(这是第1次查询,无历史记录)"

        lines = ["=== 已完成的查询历史 ==="]
        for r in self.rounds:
            keywords = self._as_str_list(r.params.get('keywords'))
            # A missing key means "no filter given" and is shown as 'all';
            # an explicit null/empty value falls through to the '全部' label.
            node_types = self._as_str_list(r.params.get('node_types', ['all']))
            engine_filter = self._as_str_list(r.params.get('engine_filter', ['all']))

            lines.append(f"第{r.round}次查询:")
            lines.append(f" 关键词: {', '.join(keywords) if keywords else '无'}")
            lines.append(f" 节点类型: {', '.join(node_types) if node_types else '全部'}")
            lines.append(f" 引擎筛选: {', '.join(engine_filter) if engine_filter else '全部'}")
            lines.append(f" 返回节点数: {r.result_count}")
            lines.append(f" 结果摘要: {r.summary}")
            lines.append("")

        lines.append("=== 请避免重复上述查询,探索新的角度 ===")
        return "\n".join(lines)

    def get_all_keywords(self) -> List[str]:
        """Return every keyword used so far, in query order."""
        keywords: List[str] = []
        for r in self.rounds:
            keywords.extend(self._as_str_list(r.params.get('keywords')))
        return keywords
| 92 | + | ||
| 93 | + | ||
class GraphRAGQueryNode(BaseNode):
    """
    GraphRAG query node.

    Responsibilities:
    1. Receive the generation context (report, chapter plan, graph overview).
    2. Keep a query history so the LLM is told what it already asked.
    3. Ask the LLM to decide the parameters of the next query.
    4. Execute the query against the knowledge graph.
    5. Stop after at most ``max_queries`` rounds.
    6. Merge the results of all rounds and return them.
    """

    def __init__(self, llm_client: LLMClient):
        super().__init__(llm_client, "GraphRAGQueryNode")

    def run(self, section: Dict[str, Any], context: Dict[str, Any],
            graph: Graph, max_queries: int = 3) -> Dict[str, Any]:
        """
        Run the multi-round GraphRAG query loop for one chapter.

        Args:
            section: Information about the chapter being generated.
            context: Generation context (report topic, chapter list, ...).
            graph: Knowledge graph to query.
            max_queries: Upper bound on the number of query rounds.

        Returns:
            The merged, de-duplicated results of all rounds, plus a
            ``query_rounds`` entry with the number of executed queries.
        """
        self.log_info(f"开始 GraphRAG 查询,章节: {section.get('title', 'unknown')}")

        query_engine = QueryEngine(graph)
        history = QueryHistory()
        all_results: List[QueryResult] = []

        for round_idx in range(max_queries):
            self.log_info(f"查询轮次 {round_idx + 1}/{max_queries}")

            # 1. Build the decision prompt.
            prompt = self._build_decision_prompt(
                section, context, query_engine, history
            )

            # 2. Let the LLM decide the query parameters.
            decision = self._get_query_decision(prompt)
            if decision is None:
                self.log_error("LLM 返回无效决策,终止查询")
                break

            # 3. Stop when the LLM says it has enough.
            if not decision.get('should_query', False):
                self.log_info(f"LLM 决定停止查询: {decision.get('reasoning', '无原因')}")
                break

            # 4. Execute the query. The decision is free-form LLM output, so
            #    the fields with a fixed expected type are normalised first.
            keywords = self._normalize_keywords(decision.get('keywords'))
            depth = self._normalize_depth(decision.get('depth', 1))
            params = QueryParams(
                keywords=keywords,
                node_types=decision.get('node_types'),
                engine_filter=decision.get('engine_filter'),
                depth=depth
            )

            result = query_engine.query(params)
            all_results.append(result)

            self.log_info(f"查询返回 {result.total_nodes} 个节点")

            # 5. Record the round using the normalised values, so the history
            #    never has to render a null or bare-string keyword field.
            history.add({**decision, 'keywords': keywords, 'depth': depth}, result)

        # 6. Merge everything that was collected.
        merged = self._merge_results(all_results)
        merged['query_rounds'] = len(all_results)

        self.log_info(f"GraphRAG 查询完成,共 {len(all_results)} 轮,"
                      f"获取 {merged.get('total_nodes', 0)} 个节点")

        return merged

    @staticmethod
    def _normalize_keywords(value: Any) -> List[str]:
        """Turn the LLM-provided ``keywords`` field into a list of strings."""
        if value is None:
            return []
        if isinstance(value, str):
            # A bare string is a single keyword.
            return [value] if value else []
        if isinstance(value, (list, tuple, set)):
            return [str(item) for item in value if item is not None]
        return [str(value)]

    @staticmethod
    def _normalize_depth(value: Any, default: int = 1) -> int:
        """Coerce the LLM-provided ``depth`` to a non-negative int, else ``default``."""
        try:
            depth = int(value)
        except (TypeError, ValueError):
            return default
        return depth if depth >= 0 else default

    def _build_decision_prompt(self, section: Dict[str, Any],
                               context: Dict[str, Any],
                               query_engine: QueryEngine,
                               history: QueryHistory) -> Dict[str, str]:
        """
        Build the system/user prompt pair for the query decision.

        Returns:
            A dict with the keys ``system`` and ``user``.
        """
        # Graph overview.
        summary = query_engine.get_node_summary()
        stats = summary.get('stats', {})

        # Section titles grouped by engine, at most five per engine.
        section_titles = query_engine.get_section_titles_by_engine()
        section_titles_text = "".join(
            f"\n{engine}: {', '.join(titles[:5])}"
            for engine, titles in section_titles.items()
        )

        # Sample of search queries present in the graph.
        sample_queries = query_engine.get_sample_search_queries(20)

        # Chapter overview, limited to the first ten chapters.
        chapters = context.get('chapters') or []
        chapters_text = "\n".join(
            f"- {c.get('id', '')}: {c.get('title', '')}"
            for c in chapters[:10]
        )

        user_prompt = GRAPHRAG_QUERY_DECISION_USER.format(
            chapter_title=section.get('title', ''),
            chapter_id=section.get('id', ''),
            chapter_role=section.get('role', ''),
            target_words=section.get('target_words', 500),
            chapter_emphasis=section.get('emphasis', ''),
            report_topic=context.get('query', ''),
            template_name=context.get('template_name', ''),
            chapters_overview=chapters_text,
            topic_name=summary.get('topic', ''),
            engine_count=len(summary.get('engines', [])),
            section_count=stats.get('section', 0),
            query_count=stats.get('search_query', 0),
            source_count=stats.get('source', 0),
            section_titles_by_engine=section_titles_text,
            sample_search_queries=', '.join(sample_queries),
            query_history_detail=history.to_prompt()
        )

        return {
            'system': GRAPHRAG_QUERY_DECISION_SYSTEM,
            'user': user_prompt
        }

    def _get_query_decision(self, prompt: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """
        Call the LLM and return its query decision.

        Returns:
            The parsed decision dict, or ``None`` when the call fails or the
            answer cannot be parsed.
        """
        try:
            response = self.llm_client.invoke(
                system=prompt['system'],
                user=prompt['user']
            )
            return self._parse_json_response(response)
        except Exception as e:
            # Best effort: a failed round ends the query loop instead of
            # aborting chapter generation.
            self.log_error(f"LLM 调用失败: {e}")
            return None

    def _parse_json_response(self, response: str) -> Optional[Dict[str, Any]]:
        """
        Parse the JSON object contained in an LLM answer.

        Tries, in order: the whole answer, the content of a fenced code
        block, and the outermost ``{...}`` span. Only a JSON *object* is
        accepted, because callers use the result as a dict.

        Returns:
            The decoded dict, or ``None`` when no JSON object is found.
        """
        if not isinstance(response, str):
            self.log_error(f"无法解析 JSON 响应: {str(response)[:200]}")
            return None

        candidates = [response]

        fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', response, re.DOTALL)
        if fenced:
            candidates.append(fenced.group(1))

        braces = re.search(r'\{.*\}', response, re.DOTALL)
        if braces:
            candidates.append(braces.group())

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed

        self.log_error(f"无法解析 JSON 响应: {response[:200]}")
        return None

    def _merge_results(self, results: List[QueryResult]) -> Dict[str, Any]:
        """
        Merge the results of several query rounds.

        Sections, queries and sources are de-duplicated by their ``id``;
        entries without an ``id`` are dropped.
        """
        merged = {
            'matched_sections': [],
            'matched_queries': [],
            'matched_sources': [],
            'total_nodes': 0,
            'cross_engine_insights': []
        }

        # One "seen" set per bucket; ids are only unique within a bucket.
        seen = {key: set() for key in
                ('matched_sections', 'matched_queries', 'matched_sources')}

        for result in results:
            for key in seen:
                for item in getattr(result, key):
                    item_id = item.get('id')
                    if item_id and item_id not in seen[key]:
                        seen[key].add(item_id)
                        merged[key].append(item)

        merged['total_nodes'] = (
            len(merged['matched_sections']) +
            len(merged['matched_queries']) +
            len(merged['matched_sources'])
        )

        merged['cross_engine_insights'] = self._generate_cross_engine_insights(merged)

        return merged

    def _generate_cross_engine_insights(self, merged: Dict[str, Any]) -> List[str]:
        """
        Produce short notes when the merged result spans several engines.

        Returns:
            Up to two strings: one listing the engines that contributed
            sections, one counting the engines that contributed queries.
        """
        insights = []

        # Engines that contributed sections, in first-seen order.
        section_engines = list(dict.fromkeys(
            section.get('engine') or 'unknown'
            for section in merged['matched_sections']
        ))
        if len(section_engines) > 1:
            insights.append(f"跨引擎信息来源: {', '.join(section_engines)}")

        # Distinct engines that contributed search queries.
        query_engines = {
            query.get('engine') or 'unknown'
            for query in merged['matched_queries']
        }
        if len(query_engines) > 1:
            insights.append(f"多引擎搜索视角: {len(query_engines)} 个引擎提供了相关搜索")

        return insights
| @@ -512,3 +512,129 @@ def build_document_layout_prompt(payload: dict) -> str: | @@ -512,3 +512,129 @@ def build_document_layout_prompt(payload: dict) -> str: | ||
def build_word_budget_prompt(payload: dict) -> str:
    """Serialize the word-budget planning input to an indented JSON string for the LLM, keeping non-ASCII text unescaped."""
    return json.dumps(payload, ensure_ascii=False, indent=2)
| 515 | + | ||
| 516 | + | ||
| 517 | +# ==================== GraphRAG 增强提示词 ==================== | ||
| 518 | + | ||
| 519 | +GRAPHRAG_CHAPTER_ENHANCEMENT_INTRO = """ | ||
| 520 | +<知识图谱查询结果> | ||
| 521 | +以下是针对本章节从知识图谱中查询到的相关信息,这些信息来自对Insight/Media/Query三个分析引擎结构化数据的聚合: | ||
| 522 | + | ||
| 523 | +{graph_results} | ||
| 524 | + | ||
| 525 | +请在生成本章内容时: | ||
| 526 | +1. 充分利用上述图谱查询结果中的具体数据点、关键发现和关联关系 | ||
| 527 | +2. 优先引用图谱中标注的来源(搜索关键词、数据来源等) | ||
| 528 | +3. 当图谱结果与三引擎报告有重叠时,以图谱中的结构化数据为准 | ||
| 529 | +4. 注意图谱中节点之间的关联关系,体现因果或递进逻辑 | ||
| 530 | +5. 如果图谱结果中有明确的数值或时间点,务必准确引用 | ||
| 531 | +</知识图谱查询结果> | ||
| 532 | +""" | ||
| 533 | + | ||
| 534 | + | ||
def build_graphrag_enhanced_user_prompt(payload: dict) -> str:
    """
    Build the chapter user prompt, optionally extended with GraphRAG results.

    The payload is serialized as JSON. When it carries a non-empty
    ``graph_enhancement_prompt`` entry, that text is left out of the JSON and
    appended after it, separated by a blank line.

    The caller's ``payload`` is not modified, so the same dict can be reused
    (for example on a retry) and still yields the same prompt.

    Args:
        payload: Chapter context, optionally containing
            ``graph_enhancement_prompt``.

    Returns:
        The serialized user prompt string.
    """
    graph_prompt = payload.get('graph_enhancement_prompt')

    # Serialize a filtered copy instead of popping from the caller's dict.
    base_payload = {
        key: value
        for key, value in payload.items()
        if key != 'graph_enhancement_prompt'
    }
    base_prompt = json.dumps(base_payload, ensure_ascii=False, indent=2)

    if graph_prompt:
        return f"{base_prompt}\n\n{graph_prompt}"

    return base_prompt
| 557 | + | ||
| 558 | + | ||
def format_graph_nodes_for_prompt(nodes: list) -> str:
    """
    Render a list of graph nodes as prompt-friendly text.

    Nodes are grouped by their ``type`` (first-seen order). Each group gets a
    heading followed by at most ten bullet lines; a bullet shows the node's
    label (falling back to its id) and a selection of its properties, each
    value cut to 100 characters.

    Args:
        nodes: Node dicts with ``id``, ``type``, ``label`` and ``properties``.

    Returns:
        The formatted description, or a placeholder when ``nodes`` is empty.
    """
    if not nodes:
        return "(无相关节点)"

    headings = {
        'topic': '主题',
        'engine': '分析引擎',
        'section': '报告段落',
        'search_query': '搜索关键词',
        'source': '数据来源'
    }
    shown_keys = ['summary', 'content', 'headline', 'url', 'query', 'source']

    # Bucket the nodes by type, keeping the order in which types first appear.
    grouped = {}
    for entry in nodes:
        grouped.setdefault(entry.get('type', 'unknown'), []).append(entry)

    rendered = []
    for kind, members in grouped.items():
        rendered.append(f"\n【{headings.get(kind, kind)}】")
        for member in members[:10]:  # at most ten nodes per type
            title = member.get('label', member.get('id', ''))
            attrs = member.get('properties', {})
            details = ''
            if attrs:
                # Keep the property order of the node itself.
                picked = [
                    f"{k}:{str(v)[:100]}"
                    for k, v in attrs.items()
                    if k in shown_keys
                ]
                if picked:
                    details = ' | ' + ', '.join(picked)
            rendered.append(f" • {title}{details}")

    return '\n'.join(rendered)
| 603 | + | ||
| 604 | + | ||
def format_graph_edges_for_prompt(edges: list, max_edges: int = 20) -> str:
    """
    Render a list of graph edges as prompt-friendly text.

    Duplicate edges (same source, relation and target) are shown once. The
    ``max_edges`` cap is applied to the de-duplicated relations, so repeated
    edges at the front of the list no longer use up the budget.

    Args:
        edges: Edge dicts with ``source``, ``target`` and ``relation``.
        max_edges: Maximum number of distinct relations to render.

    Returns:
        One bullet line per relation, or a placeholder when there is nothing
        to show.
    """
    if not edges:
        return "(无关联关系)"

    relation_labels = {
        'analyzed_by': '被分析于',
        'contains': '包含',
        'searched': '搜索了',
        'found': '发现于'
    }

    lines = []
    seen = set()
    for edge in edges:
        if len(lines) >= max_edges:
            break

        source = edge.get('source', '')
        target = edge.get('target', '')
        relation = edge.get('relation', 'related')

        # A tuple key cannot collide the way a '-'-joined string can when
        # the values themselves contain '-'.
        key = (source, relation, target)
        if key in seen:
            continue
        seen.add(key)

        rel_label = relation_labels.get(relation, relation)
        lines.append(f" • {source} —[{rel_label}]→ {target}")

    return '\n'.join(lines) if lines else "(无关联关系)"
| @@ -67,6 +67,14 @@ class Settings(BaseSettings): | @@ -67,6 +67,14 @@ class Settings(BaseSettings): | ||
| 67 | "logs/json_repair_failures", description="无法修复的JSON块落盘目录" | 67 | "logs/json_repair_failures", description="无法修复的JSON块落盘目录" |
| 68 | ) | 68 | ) |
| 69 | 69 | ||
| 70 | + # GraphRAG 配置 | ||
| 71 | + GRAPHRAG_ENABLED: bool = Field( | ||
| 72 | + default=False, description="是否启用GraphRAG知识图谱功能" | ||
| 73 | + ) | ||
| 74 | + GRAPHRAG_MAX_QUERIES: int = Field( | ||
| 75 | + default=3, description="GraphRAG每章节查询次数上限" | ||
| 76 | + ) | ||
| 77 | + | ||
| 70 | class Config: | 78 | class Config: |
| 71 | """Pydantic配置:允许从.env读取并兼容大小写""" | 79 | """Pydantic配置:允许从.env读取并兼容大小写""" |
| 72 | env_file = ".env" | 80 | env_file = ".env" |
| @@ -113,7 +113,9 @@ CONFIG_KEYS = [ | @@ -113,7 +113,9 @@ CONFIG_KEYS = [ | ||
| 113 | 'TAVILY_API_KEY', | 113 | 'TAVILY_API_KEY', |
| 114 | 'SEARCH_TOOL_TYPE', | 114 | 'SEARCH_TOOL_TYPE', |
| 115 | 'BOCHA_WEB_SEARCH_API_KEY', | 115 | 'BOCHA_WEB_SEARCH_API_KEY', |
| 116 | - 'ANSPIRE_API_KEY' | 116 | + 'ANSPIRE_API_KEY', |
| 117 | + 'GRAPHRAG_ENABLED', | ||
| 118 | + 'GRAPHRAG_MAX_QUERIES' | ||
| 117 | ] | 119 | ] |
| 118 | 120 | ||
| 119 | 121 | ||
| @@ -1295,6 +1297,247 @@ def shutdown_system(): | @@ -1295,6 +1297,247 @@ def shutdown_system(): | ||
| 1295 | logger.exception("系统关闭过程中出现异常") | 1297 | logger.exception("系统关闭过程中出现异常") |
| 1296 | return jsonify({'success': False, 'message': f'系统关闭异常: {exc}'}), 500 | 1298 | return jsonify({'success': False, 'message': f'系统关闭异常: {exc}'}), 500 |
| 1297 | 1299 | ||
| 1300 | +# ==================== GraphRAG API 端点 ==================== | ||
| 1301 | + | ||
@app.route('/api/graph/<report_id>')
def get_graph_data(report_id):
    """
    Return the knowledge graph of one report as JSON.

    The payload is shaped for Vis.js rendering on the front end:
    - nodes: [{id, label, group, title, properties}]
    - edges: [{from, to, label, arrows}]

    Responds with 404 when no graph file is found for ``report_id`` and with
    500 when loading or converting the graph raises.
    """
    try:
        from ReportEngine.graphrag import GraphStorage, Graph

        # Look the graph file up in the default storage location.
        store = GraphStorage()
        located = store.find_graph_by_report_id(report_id)

        if not (located and located.exists()):
            not_found = {
                'success': False,
                'message': f'未找到报告 {report_id} 的知识图谱数据'
            }
            return jsonify(not_found), 404

        loaded = store.load(located)

        # Convert nodes and edges to the Vis.js structures.
        node_items = [
            {
                'id': key,
                'label': item.label or key,
                'group': item.type,
                'title': _format_node_tooltip(item),
                'properties': item.properties
            }
            for key, item in loaded.nodes.items()
        ]
        edge_items = [
            {
                'from': link.source,
                'to': link.target,
                'label': link.relation,
                'arrows': 'to'
            }
            for link in loaded.edges
        ]

        body = {
            'success': True,
            'graph': {
                'nodes': node_items,
                'edges': edge_items,
                'stats': loaded.get_stats()
            }
        }
        return jsonify(body)

    except Exception as e:
        logger.exception(f"获取图谱数据失败: {e}")
        failure = {
            'success': False,
            'message': f'获取图谱数据失败: {str(e)}'
        }
        return jsonify(failure), 500
| 1362 | + | ||
| 1363 | + | ||
@app.route('/api/graph/latest')
def get_latest_graph():
    """
    Return the most recently generated knowledge graph as JSON.

    The graph is converted to the Vis.js node/edge structures, and the name
    of the graph file's parent directory is reported as ``report_id``.
    Responds with 404 when no graph exists and with 500 on any error.
    """
    try:
        from ReportEngine.graphrag import GraphStorage

        store = GraphStorage()
        newest = store.find_latest_graph()

        if not (newest and newest.exists()):
            missing = {
                'success': False,
                'message': '暂无可用的知识图谱数据'
            }
            return jsonify(missing), 404

        loaded = store.load(newest)
        if newest.parent:
            report_id = newest.parent.name
        else:
            report_id = 'unknown'

        # Convert nodes and edges to the Vis.js structures.
        node_items = [
            {
                'id': key,
                'label': item.label or key,
                'group': item.type,
                'title': _format_node_tooltip(item),
                'properties': item.properties
            }
            for key, item in loaded.nodes.items()
        ]
        edge_items = [
            {
                'from': link.source,
                'to': link.target,
                'label': link.relation,
                'arrows': 'to'
            }
            for link in loaded.edges
        ]

        body = {
            'success': True,
            'report_id': report_id,
            'graph': {
                'nodes': node_items,
                'edges': edge_items,
                'stats': loaded.get_stats()
            }
        }
        return jsonify(body)

    except Exception as e:
        logger.exception(f"获取最新图谱失败: {e}")
        failure = {
            'success': False,
            'message': f'获取最新图谱失败: {str(e)}'
        }
        return jsonify(failure), 500
| 1419 | + | ||
| 1420 | + | ||
@app.route('/graph-viewer')
@app.route('/graph-viewer/')
@app.route('/graph-viewer/<report_id>')
def graph_viewer(report_id=None):
    """
    Serve the knowledge-graph visualization page.

    Renders the ``graph_viewer.html`` template and passes ``report_id``
    through to it (``None`` when the URL carries no report id).

    According to the original author, the page is meant to offer:
    - fullscreen mode
    - zooming and dragging
    - node detail inspection
    - filtering and search
    These features live in the template, not in this view function.
    """
    return render_template('graph_viewer.html', report_id=report_id)
| 1435 | + | ||
| 1436 | + | ||
@app.route('/api/graph/query', methods=['POST'])
def query_graph():
    """
    Query the knowledge graph.

    Request body (JSON object, every field optional):
    {
        "report_id": "xxx",              // defaults to the latest graph
        "keywords": ["kw1", "kw2"],
        "node_types": ["section", "source"],
        "engine_filter": [...],
        "depth": 2
    }

    Responds with 400 when the body is not a JSON object or ``depth`` is not
    an integer, 404 when no graph is available, and 500 on any other error.
    """
    try:
        from ReportEngine.graphrag import GraphStorage, QueryEngine, QueryParams

        # silent=True: a missing or malformed JSON body yields None instead of
        # an exception that the generic handler below would report as a 500.
        data = request.get_json(silent=True)
        if data is None:
            data = {}
        if not isinstance(data, dict):
            return jsonify({
                'success': False,
                'message': '请求体必须是 JSON 对象'
            }), 400

        try:
            depth = int(data.get('depth', 1))
        except (TypeError, ValueError):
            return jsonify({
                'success': False,
                'message': 'depth 必须是整数'
            }), 400

        report_id = data.get('report_id')

        storage = GraphStorage()

        if report_id:
            graph_path = storage.find_graph_by_report_id(report_id)
        else:
            graph_path = storage.find_latest_graph()

        if not graph_path or not graph_path.exists():
            return jsonify({
                'success': False,
                'message': '未找到可用的知识图谱'
            }), 404

        graph = storage.load(graph_path)
        query_engine = QueryEngine(graph)

        params = QueryParams(
            keywords=data.get('keywords') or [],
            node_types=data.get('node_types'),
            engine_filter=data.get('engine_filter'),
            depth=depth
        )

        result = query_engine.query(params)

        def _node_to_dict(n):
            # Plain-dict view of a graph node for the JSON response.
            return {
                'id': n.id,
                'type': n.type,
                'label': n.label,
                'properties': n.properties
            }

        return jsonify({
            'success': True,
            'result': {
                'matched_nodes': [_node_to_dict(n) for n in result.matched_nodes],
                'related_edges': [
                    {
                        'source': e.source,
                        'target': e.target,
                        'relation': e.relation
                    }
                    for e in result.related_edges
                ],
                'expanded_nodes': [_node_to_dict(n) for n in result.expanded_nodes]
            }
        })

    except Exception as e:
        logger.exception(f"图谱查询失败: {e}")
        return jsonify({
            'success': False,
            'message': f'图谱查询失败: {str(e)}'
        }), 500
| 1519 | + | ||
| 1520 | + | ||
def _format_node_tooltip(node) -> str:
    """
    Build the hover-tooltip markup for a graph node.

    The result is an HTML fragment: the bold node label (or id), its type,
    and the ``summary`` (max 100 chars), ``content`` (max 80 chars), ``url``
    and ``query`` properties when present, joined with ``<br>``.

    All values are HTML-escaped, because node labels and properties are data
    taken from the graph and must not be able to inject markup. Properties
    whose value is ``None`` are skipped, and the ``...`` suffix is only added
    when text was actually cut.
    """
    from html import escape

    def _clip(text, limit):
        # Shorten to `limit` characters, marking the cut with an ellipsis.
        if limit is None or len(text) <= limit:
            return text
        return f"{text[:limit]}..."

    lines = [f"<b>{escape(str(node.label or node.id))}</b>"]
    lines.append(f"类型: {escape(str(node.type))}")

    props = node.properties or {}
    fields = (
        ('summary', '摘要', 100),
        ('content', '内容', 80),
        ('url', '链接', None),
        ('query', '查询', None),
    )
    for key, caption, limit in fields:
        value = props.get(key)
        if value is None:
            continue
        lines.append(f"{caption}: {escape(_clip(str(value), limit))}")

    return "<br>".join(lines)
| 1537 | + | ||
| 1538 | + | ||
| 1539 | +# ==================== GraphRAG API 端点结束 ==================== | ||
| 1540 | + | ||
| 1298 | @socketio.on('connect') | 1541 | @socketio.on('connect') |
| 1299 | def handle_connect(): | 1542 | def handle_connect(): |
| 1300 | """客户端连接""" | 1543 | """客户端连接""" |
templates/graph_viewer.html
0 → 100644
| 1 | +<!DOCTYPE html> | ||
| 2 | +<html lang="zh-CN"> | ||
| 3 | +<head> | ||
| 4 | + <meta charset="UTF-8"> | ||
| 5 | + <meta name="viewport" content="width=device-width, initial-scale=1.0"> | ||
| 6 | + <title>知识图谱可视化 - BettaFish</title> | ||
| 7 | + <!-- Vis.js --> | ||
| 8 | + <script src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script> | ||
| 9 | + <style> | ||
| 10 | + :root { | ||
| 11 | + --primary-color: #4F46E5; | ||
| 12 | + --primary-light: #818CF8; | ||
| 13 | + --bg-color: #0F172A; | ||
| 14 | + --card-bg: #1E293B; | ||
| 15 | + --text-color: #F1F5F9; | ||
| 16 | + --text-muted: #94A3B8; | ||
| 17 | + --border-color: #334155; | ||
| 18 | + --success-color: #10B981; | ||
| 19 | + --warning-color: #F59E0B; | ||
| 20 | + --error-color: #EF4444; | ||
| 21 | + } | ||
| 22 | + | ||
| 23 | + * { | ||
| 24 | + margin: 0; | ||
| 25 | + padding: 0; | ||
| 26 | + box-sizing: border-box; | ||
| 27 | + } | ||
| 28 | + | ||
| 29 | + body { | ||
| 30 | + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif; | ||
| 31 | + background-color: var(--bg-color); | ||
| 32 | + color: var(--text-color); | ||
| 33 | + min-height: 100vh; | ||
| 34 | + } | ||
| 35 | + | ||
| 36 | + /* 顶部工具栏 */ | ||
| 37 | + .toolbar { | ||
| 38 | + position: fixed; | ||
| 39 | + top: 0; | ||
| 40 | + left: 0; | ||
| 41 | + right: 0; | ||
| 42 | + height: 60px; | ||
| 43 | + background: var(--card-bg); | ||
| 44 | + border-bottom: 1px solid var(--border-color); | ||
| 45 | + display: flex; | ||
| 46 | + align-items: center; | ||
| 47 | + padding: 0 20px; | ||
| 48 | + gap: 16px; | ||
| 49 | + z-index: 1000; | ||
| 50 | + } | ||
| 51 | + | ||
| 52 | + .toolbar h1 { | ||
| 53 | + font-size: 1.25rem; | ||
| 54 | + font-weight: 600; | ||
| 55 | + display: flex; | ||
| 56 | + align-items: center; | ||
| 57 | + gap: 8px; | ||
| 58 | + } | ||
| 59 | + | ||
| 60 | + .toolbar h1 svg { | ||
| 61 | + width: 24px; | ||
| 62 | + height: 24px; | ||
| 63 | + color: var(--primary-color); | ||
| 64 | + } | ||
| 65 | + | ||
| 66 | + .toolbar-divider { | ||
| 67 | + width: 1px; | ||
| 68 | + height: 30px; | ||
| 69 | + background: var(--border-color); | ||
| 70 | + } | ||
| 71 | + | ||
| 72 | + .btn { | ||
| 73 | + display: flex; | ||
| 74 | + align-items: center; | ||
| 75 | + gap: 6px; | ||
| 76 | + padding: 8px 16px; | ||
| 77 | + border: 1px solid var(--border-color); | ||
| 78 | + border-radius: 6px; | ||
| 79 | + background: transparent; | ||
| 80 | + color: var(--text-color); | ||
| 81 | + cursor: pointer; | ||
| 82 | + font-size: 0.875rem; | ||
| 83 | + transition: all 0.2s; | ||
| 84 | + } | ||
| 85 | + | ||
| 86 | + .btn:hover { | ||
| 87 | + background: var(--primary-color); | ||
| 88 | + border-color: var(--primary-color); | ||
| 89 | + } | ||
| 90 | + | ||
| 91 | + .btn-primary { | ||
| 92 | + background: var(--primary-color); | ||
| 93 | + border-color: var(--primary-color); | ||
| 94 | + } | ||
| 95 | + | ||
| 96 | + .btn svg { | ||
| 97 | + width: 16px; | ||
| 98 | + height: 16px; | ||
| 99 | + } | ||
| 100 | + | ||
| 101 | + .search-box { | ||
| 102 | + flex: 1; | ||
| 103 | + max-width: 400px; | ||
| 104 | + position: relative; | ||
| 105 | + } | ||
| 106 | + | ||
| 107 | + .search-box input { | ||
| 108 | + width: 100%; | ||
| 109 | + padding: 8px 16px 8px 40px; | ||
| 110 | + border: 1px solid var(--border-color); | ||
| 111 | + border-radius: 6px; | ||
| 112 | + background: var(--bg-color); | ||
| 113 | + color: var(--text-color); | ||
| 114 | + font-size: 0.875rem; | ||
| 115 | + } | ||
| 116 | + | ||
| 117 | + .search-box input:focus { | ||
| 118 | + outline: none; | ||
| 119 | + border-color: var(--primary-color); | ||
| 120 | + } | ||
| 121 | + | ||
| 122 | + .search-box svg { | ||
| 123 | + position: absolute; | ||
| 124 | + left: 12px; | ||
| 125 | + top: 50%; | ||
| 126 | + transform: translateY(-50%); | ||
| 127 | + width: 16px; | ||
| 128 | + height: 16px; | ||
| 129 | + color: var(--text-muted); | ||
| 130 | + } | ||
| 131 | + | ||
| 132 | + /* 统计信息 */ | ||
| 133 | + .stats { | ||
| 134 | + display: flex; | ||
| 135 | + gap: 16px; | ||
| 136 | + margin-left: auto; | ||
| 137 | + } | ||
| 138 | + | ||
| 139 | + .stat-item { | ||
| 140 | + display: flex; | ||
| 141 | + align-items: center; | ||
| 142 | + gap: 6px; | ||
| 143 | + font-size: 0.875rem; | ||
| 144 | + } | ||
| 145 | + | ||
| 146 | + .stat-item .label { | ||
| 147 | + color: var(--text-muted); | ||
| 148 | + } | ||
| 149 | + | ||
| 150 | + .stat-item .value { | ||
| 151 | + font-weight: 600; | ||
| 152 | + color: var(--primary-light); | ||
| 153 | + } | ||
| 154 | + | ||
| 155 | + /* 左侧面板 */ | ||
| 156 | + .sidebar { | ||
| 157 | + position: fixed; | ||
| 158 | + top: 60px; | ||
| 159 | + left: 0; | ||
| 160 | + width: 300px; | ||
| 161 | + bottom: 0; | ||
| 162 | + background: var(--card-bg); | ||
| 163 | + border-right: 1px solid var(--border-color); | ||
| 164 | + overflow-y: auto; | ||
| 165 | + padding: 16px; | ||
| 166 | + transition: transform 0.3s; | ||
| 167 | + z-index: 100; | ||
| 168 | + } | ||
| 169 | + | ||
| 170 | + .sidebar.collapsed { | ||
| 171 | + transform: translateX(-100%); | ||
| 172 | + } | ||
| 173 | + | ||
| 174 | + .sidebar h3 { | ||
| 175 | + font-size: 0.875rem; | ||
| 176 | + font-weight: 600; | ||
| 177 | + color: var(--text-muted); | ||
| 178 | + text-transform: uppercase; | ||
| 179 | + letter-spacing: 0.05em; | ||
| 180 | + margin-bottom: 12px; | ||
| 181 | + } | ||
| 182 | + | ||
| 183 | + .filter-group { | ||
| 184 | + margin-bottom: 20px; | ||
| 185 | + } | ||
| 186 | + | ||
| 187 | + .filter-item { | ||
| 188 | + display: flex; | ||
| 189 | + align-items: center; | ||
| 190 | + gap: 10px; | ||
| 191 | + padding: 8px 0; | ||
| 192 | + cursor: pointer; | ||
| 193 | + } | ||
| 194 | + | ||
| 195 | + .filter-item input[type="checkbox"] { | ||
| 196 | + width: 16px; | ||
| 197 | + height: 16px; | ||
| 198 | + accent-color: var(--primary-color); | ||
| 199 | + } | ||
| 200 | + | ||
| 201 | + .filter-item .color-dot { | ||
| 202 | + width: 12px; | ||
| 203 | + height: 12px; | ||
| 204 | + border-radius: 50%; | ||
| 205 | + } | ||
| 206 | + | ||
| 207 | + .filter-item .count { | ||
| 208 | + margin-left: auto; | ||
| 209 | + font-size: 0.75rem; | ||
| 210 | + color: var(--text-muted); | ||
| 211 | + } | ||
| 212 | + | ||
| 213 | + /* 节点详情 */ | ||
| 214 | + .node-detail { | ||
| 215 | + margin-top: 20px; | ||
| 216 | + padding-top: 20px; | ||
| 217 | + border-top: 1px solid var(--border-color); | ||
| 218 | + } | ||
| 219 | + | ||
| 220 | + .node-detail .detail-title { | ||
| 221 | + font-weight: 600; | ||
| 222 | + margin-bottom: 8px; | ||
| 223 | + color: var(--primary-light); | ||
| 224 | + } | ||
| 225 | + | ||
| 226 | + .node-detail .detail-type { | ||
| 227 | + font-size: 0.75rem; | ||
| 228 | + color: var(--text-muted); | ||
| 229 | + margin-bottom: 12px; | ||
| 230 | + } | ||
| 231 | + | ||
| 232 | + .node-detail .detail-props { | ||
| 233 | + font-size: 0.875rem; | ||
| 234 | + } | ||
| 235 | + | ||
| 236 | + .node-detail .prop-item { | ||
| 237 | + padding: 6px 0; | ||
| 238 | + border-bottom: 1px solid var(--border-color); | ||
| 239 | + } | ||
| 240 | + | ||
| 241 | + .node-detail .prop-key { | ||
| 242 | + color: var(--text-muted); | ||
| 243 | + font-size: 0.75rem; | ||
| 244 | + } | ||
| 245 | + | ||
| 246 | + .node-detail .prop-value { | ||
| 247 | + margin-top: 2px; | ||
| 248 | + word-break: break-all; | ||
| 249 | + } | ||
| 250 | + | ||
| 251 | + /* 图谱容器 */ | ||
| 252 | + .graph-container { | ||
| 253 | + position: fixed; | ||
| 254 | + top: 60px; | ||
| 255 | + left: 300px; | ||
| 256 | + right: 0; | ||
| 257 | + bottom: 0; | ||
| 258 | + transition: left 0.3s; | ||
| 259 | + } | ||
| 260 | + | ||
| 261 | + .graph-container.fullwidth { | ||
| 262 | + left: 0; | ||
| 263 | + } | ||
| 264 | + | ||
| 265 | + #network { | ||
| 266 | + width: 100%; | ||
| 267 | + height: 100%; | ||
| 268 | + background: var(--bg-color); | ||
| 269 | + } | ||
| 270 | + | ||
| 271 | + /* 加载状态 */ | ||
| 272 | + .loading-overlay { | ||
| 273 | + position: absolute; | ||
| 274 | + top: 0; | ||
| 275 | + left: 0; | ||
| 276 | + right: 0; | ||
| 277 | + bottom: 0; | ||
| 278 | + display: flex; | ||
| 279 | + flex-direction: column; | ||
| 280 | + align-items: center; | ||
| 281 | + justify-content: center; | ||
| 282 | + background: var(--bg-color); | ||
| 283 | + z-index: 500; | ||
| 284 | + } | ||
| 285 | + | ||
| 286 | + .loading-spinner { | ||
| 287 | + width: 48px; | ||
| 288 | + height: 48px; | ||
| 289 | + border: 4px solid var(--border-color); | ||
| 290 | + border-top-color: var(--primary-color); | ||
| 291 | + border-radius: 50%; | ||
| 292 | + animation: spin 1s linear infinite; | ||
| 293 | + } | ||
| 294 | + | ||
| 295 | + @keyframes spin { | ||
| 296 | + to { transform: rotate(360deg); } | ||
| 297 | + } | ||
| 298 | + | ||
| 299 | + .loading-text { | ||
| 300 | + margin-top: 16px; | ||
| 301 | + color: var(--text-muted); | ||
| 302 | + } | ||
| 303 | + | ||
| 304 | + /* 空状态 */ | ||
| 305 | + .empty-state { | ||
| 306 | + position: absolute; | ||
| 307 | + top: 50%; | ||
| 308 | + left: 50%; | ||
| 309 | + transform: translate(-50%, -50%); | ||
| 310 | + text-align: center; | ||
| 311 | + color: var(--text-muted); | ||
| 312 | + } | ||
| 313 | + | ||
| 314 | + .empty-state svg { | ||
| 315 | + width: 64px; | ||
| 316 | + height: 64px; | ||
| 317 | + margin-bottom: 16px; | ||
| 318 | + opacity: 0.5; | ||
| 319 | + } | ||
| 320 | + | ||
| 321 | + /* 提示信息 */ | ||
| 322 | + .toast { | ||
| 323 | + position: fixed; | ||
| 324 | + bottom: 20px; | ||
| 325 | + right: 20px; | ||
| 326 | + padding: 12px 20px; | ||
| 327 | + background: var(--card-bg); | ||
| 328 | + border: 1px solid var(--border-color); | ||
| 329 | + border-radius: 8px; | ||
| 330 | + display: none; | ||
| 331 | + animation: slideIn 0.3s; | ||
| 332 | + z-index: 2000; | ||
| 333 | + } | ||
| 334 | + | ||
| 335 | + @keyframes slideIn { | ||
| 336 | + from { | ||
| 337 | + transform: translateX(100%); | ||
| 338 | + opacity: 0; | ||
| 339 | + } | ||
| 340 | + } | ||
| 341 | + | ||
| 342 | + /* 图例 */ | ||
| 343 | + .legend { | ||
| 344 | + position: fixed; | ||
| 345 | + bottom: 20px; | ||
| 346 | + left: 320px; | ||
| 347 | + background: var(--card-bg); | ||
| 348 | + border: 1px solid var(--border-color); | ||
| 349 | + border-radius: 8px; | ||
| 350 | + padding: 12px 16px; | ||
| 351 | + display: flex; | ||
| 352 | + gap: 16px; | ||
| 353 | + z-index: 100; | ||
| 354 | + transition: left 0.3s; | ||
| 355 | + } | ||
| 356 | + | ||
| 357 | + .legend.fullwidth { | ||
| 358 | + left: 20px; | ||
| 359 | + } | ||
| 360 | + | ||
| 361 | + .legend-item { | ||
| 362 | + display: flex; | ||
| 363 | + align-items: center; | ||
| 364 | + gap: 6px; | ||
| 365 | + font-size: 0.75rem; | ||
| 366 | + } | ||
| 367 | + | ||
| 368 | + .legend-item .dot { | ||
| 369 | + width: 10px; | ||
| 370 | + height: 10px; | ||
| 371 | + border-radius: 50%; | ||
| 372 | + } | ||
| 373 | + | ||
| 374 | + /* 全屏模式 */ | ||
| 375 | + .fullscreen-btn { | ||
| 376 | + position: fixed; | ||
| 377 | + bottom: 20px; | ||
| 378 | + right: 20px; | ||
| 379 | + z-index: 100; | ||
| 380 | + } | ||
| 381 | + | ||
| 382 | + /* 节点类型颜色 */ | ||
| 383 | + .color-topic { background-color: #EF4444; } | ||
| 384 | + .color-engine { background-color: #F59E0B; } | ||
| 385 | + .color-section { background-color: #10B981; } | ||
| 386 | + .color-search_query { background-color: #3B82F6; } | ||
| 387 | + .color-source { background-color: #8B5CF6; } | ||
| 388 | + </style> | ||
| 389 | +</head> | ||
| 390 | +<body> | ||
| 391 | + <!-- 顶部工具栏 --> | ||
| 392 | + <div class="toolbar"> | ||
| 393 | + <h1> | ||
| 394 | + <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> | ||
| 395 | + <circle cx="12" cy="5" r="3"/> | ||
| 396 | + <circle cx="5" cy="19" r="3"/> | ||
| 397 | + <circle cx="19" cy="19" r="3"/> | ||
| 398 | + <line x1="12" y1="8" x2="5" y2="16"/> | ||
| 399 | + <line x1="12" y1="8" x2="19" y2="16"/> | ||
| 400 | + </svg> | ||
| 401 | + 知识图谱 | ||
| 402 | + </h1> | ||
| 403 | + | ||
| 404 | + <div class="toolbar-divider"></div> | ||
| 405 | + | ||
| 406 | + <button class="btn" id="toggleSidebar" title="切换侧边栏"> | ||
| 407 | + <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> | ||
| 408 | + <rect x="3" y="3" width="18" height="18" rx="2"/> | ||
| 409 | + <line x1="9" y1="3" x2="9" y2="21"/> | ||
| 410 | + </svg> | ||
| 411 | + </button> | ||
| 412 | + | ||
| 413 | + <button class="btn" id="fitBtn" title="适应视图"> | ||
| 414 | + <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> | ||
| 415 | + <path d="M15 3h6v6M9 21H3v-6M21 3l-7 7M3 21l7-7"/> | ||
| 416 | + </svg> | ||
| 417 | + 适应 | ||
| 418 | + </button> | ||
| 419 | + | ||
| 420 | + <button class="btn" id="zoomInBtn" title="放大"> | ||
| 421 | + <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> | ||
| 422 | + <circle cx="11" cy="11" r="8"/> | ||
| 423 | + <line x1="21" y1="21" x2="16.65" y2="16.65"/> | ||
| 424 | + <line x1="11" y1="8" x2="11" y2="14"/> | ||
| 425 | + <line x1="8" y1="11" x2="14" y2="11"/> | ||
| 426 | + </svg> | ||
| 427 | + </button> | ||
| 428 | + | ||
| 429 | + <button class="btn" id="zoomOutBtn" title="缩小"> | ||
| 430 | + <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> | ||
| 431 | + <circle cx="11" cy="11" r="8"/> | ||
| 432 | + <line x1="21" y1="21" x2="16.65" y2="16.65"/> | ||
| 433 | + <line x1="8" y1="11" x2="14" y2="11"/> | ||
| 434 | + </svg> | ||
| 435 | + </button> | ||
| 436 | + | ||
| 437 | + <div class="search-box"> | ||
| 438 | + <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> | ||
| 439 | + <circle cx="11" cy="11" r="8"/> | ||
| 440 | + <line x1="21" y1="21" x2="16.65" y2="16.65"/> | ||
| 441 | + </svg> | ||
| 442 | + <input type="text" id="searchInput" placeholder="搜索节点..."> | ||
| 443 | + </div> | ||
| 444 | + | ||
| 445 | + <div class="stats" id="statsContainer"> | ||
| 446 | + <div class="stat-item"> | ||
| 447 | + <span class="label">节点</span> | ||
| 448 | + <span class="value" id="nodeCount">0</span> | ||
| 449 | + </div> | ||
| 450 | + <div class="stat-item"> | ||
| 451 | + <span class="label">关系</span> | ||
| 452 | + <span class="value" id="edgeCount">0</span> | ||
| 453 | + </div> | ||
| 454 | + </div> | ||
| 455 | + </div> | ||
| 456 | + | ||
| 457 | + <!-- 左侧面板 --> | ||
| 458 | + <div class="sidebar" id="sidebar"> | ||
| 459 | + <div class="filter-group"> | ||
| 460 | + <h3>节点类型</h3> | ||
| 461 | + <label class="filter-item"> | ||
| 462 | + <input type="checkbox" checked data-type="topic"> | ||
| 463 | + <span class="color-dot color-topic"></span> | ||
| 464 | + <span>主题</span> | ||
| 465 | + <span class="count" id="count-topic">0</span> | ||
| 466 | + </label> | ||
| 467 | + <label class="filter-item"> | ||
| 468 | + <input type="checkbox" checked data-type="engine"> | ||
| 469 | + <span class="color-dot color-engine"></span> | ||
| 470 | + <span>分析引擎</span> | ||
| 471 | + <span class="count" id="count-engine">0</span> | ||
| 472 | + </label> | ||
| 473 | + <label class="filter-item"> | ||
| 474 | + <input type="checkbox" checked data-type="section"> | ||
| 475 | + <span class="color-dot color-section"></span> | ||
| 476 | + <span>报告段落</span> | ||
| 477 | + <span class="count" id="count-section">0</span> | ||
| 478 | + </label> | ||
| 479 | + <label class="filter-item"> | ||
| 480 | + <input type="checkbox" checked data-type="search_query"> | ||
| 481 | + <span class="color-dot color-search_query"></span> | ||
| 482 | + <span>搜索关键词</span> | ||
| 483 | + <span class="count" id="count-search_query">0</span> | ||
| 484 | + </label> | ||
| 485 | + <label class="filter-item"> | ||
| 486 | + <input type="checkbox" checked data-type="source"> | ||
| 487 | + <span class="color-dot color-source"></span> | ||
| 488 | + <span>数据来源</span> | ||
| 489 | + <span class="count" id="count-source">0</span> | ||
| 490 | + </label> | ||
| 491 | + </div> | ||
| 492 | + | ||
| 493 | + <div class="node-detail" id="nodeDetail" style="display: none;"> | ||
| 494 | + <h3>节点详情</h3> | ||
| 495 | + <div class="detail-title" id="detailTitle"></div> | ||
| 496 | + <div class="detail-type" id="detailType"></div> | ||
| 497 | + <div class="detail-props" id="detailProps"></div> | ||
| 498 | + </div> | ||
| 499 | + </div> | ||
| 500 | + | ||
| 501 | + <!-- 图谱容器 --> | ||
| 502 | + <div class="graph-container" id="graphContainer"> | ||
| 503 | + <div id="network"></div> | ||
| 504 | + | ||
| 505 | + <!-- 加载状态 --> | ||
| 506 | + <div class="loading-overlay" id="loadingOverlay"> | ||
| 507 | + <div class="loading-spinner"></div> | ||
| 508 | + <div class="loading-text">正在加载知识图谱...</div> | ||
| 509 | + </div> | ||
| 510 | + | ||
| 511 | + <!-- 空状态 --> | ||
| 512 | + <div class="empty-state" id="emptyState" style="display: none;"> | ||
| 513 | + <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1"> | ||
| 514 | + <circle cx="12" cy="12" r="10"/> | ||
| 515 | + <path d="M8 15h8"/> | ||
| 516 | + <path d="M9 9h.01"/> | ||
| 517 | + <path d="M15 9h.01"/> | ||
| 518 | + </svg> | ||
| 519 | + <h3>暂无图谱数据</h3> | ||
| 520 | + <p>请先生成报告以创建知识图谱</p> | ||
| 521 | + </div> | ||
| 522 | + </div> | ||
| 523 | + | ||
| 524 | + <!-- 图例 --> | ||
| 525 | + <div class="legend" id="legend"> | ||
| 526 | + <div class="legend-item"> | ||
| 527 | + <span class="dot color-topic"></span> | ||
| 528 | + <span>主题</span> | ||
| 529 | + </div> | ||
| 530 | + <div class="legend-item"> | ||
| 531 | + <span class="dot color-engine"></span> | ||
| 532 | + <span>引擎</span> | ||
| 533 | + </div> | ||
| 534 | + <div class="legend-item"> | ||
| 535 | + <span class="dot color-section"></span> | ||
| 536 | + <span>段落</span> | ||
| 537 | + </div> | ||
| 538 | + <div class="legend-item"> | ||
| 539 | + <span class="dot color-search_query"></span> | ||
| 540 | + <span>搜索词</span> | ||
| 541 | + </div> | ||
| 542 | + <div class="legend-item"> | ||
| 543 | + <span class="dot color-source"></span> | ||
| 544 | + <span>来源</span> | ||
| 545 | + </div> | ||
| 546 | + </div> | ||
| 547 | + | ||
| 548 | + <!-- 全屏按钮 --> | ||
| 549 | + <button class="btn fullscreen-btn" id="fullscreenBtn" title="全屏"> | ||
| 550 | + <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> | ||
| 551 | + <path d="M8 3H5a2 2 0 00-2 2v3m18 0V5a2 2 0 00-2-2h-3m0 18h3a2 2 0 002-2v-3M3 16v3a2 2 0 002 2h3"/> | ||
| 552 | + </svg> | ||
| 553 | + </button> | ||
| 554 | + | ||
| 555 | + <!-- 提示 --> | ||
| 556 | + <div class="toast" id="toast"></div> | ||
| 557 | + | ||
| 558 | + <script> | ||
| 559 | + // Configuration: fill/border color per node type (looked up by node.group in renderGraph) | ||
| 560 | + const NODE_COLORS = { | ||
| 561 | + topic: '#EF4444', | ||
| 562 | + engine: '#F59E0B', | ||
| 563 | + section: '#10B981', | ||
| 564 | + search_query: '#3B82F6', | ||
| 565 | + source: '#8B5CF6' | ||
| 566 | + }; | ||
| 567 | + | ||
| 568 | + const NODE_SHAPES = { | ||
| 569 | + topic: 'star', | ||
| 570 | + engine: 'diamond', | ||
| 571 | + section: 'dot', | ||
| 572 | + search_query: 'triangle', | ||
| 573 | + source: 'square' | ||
| 574 | + }; | ||
| 575 | + | ||
| 576 | + // Global state: the current vis.Network instance, the unfiltered node/edge lists from the API, and the server-injected report id (null means "load the latest graph") | ||
| 577 | + let network = null; | ||
| 578 | + let allNodes = []; | ||
| 579 | + let allEdges = []; | ||
| 580 | + let reportId = {{ report_id | tojson if report_id else 'null' }}; | ||
| 581 | + | ||
| 582 | + // Initialization: once the DOM is ready, fetch the graph and wire up the UI handlers | ||
| 583 | + document.addEventListener('DOMContentLoaded', () => { | ||
| 584 | + loadGraphData(); | ||
| 585 | + setupEventListeners(); | ||
| 586 | + }); | ||
| 587 | + | ||
/**
 * Fetch the graph for the current report (or the latest one when no
 * reportId is set), store it in allNodes/allEdges and render it.
 *
 * Shows the empty state when the API reports no graph or the request
 * fails; failures additionally raise a toast. The loading overlay is
 * always hidden when the function settles.
 */
async function loadGraphData() {
    showLoading(true);

    try {
        // Encode the id so characters such as "/" or "?" cannot alter the request path.
        const url = reportId
            ? `/api/graph/${encodeURIComponent(reportId)}`
            : '/api/graph/latest';

        const response = await fetch(url);
        const data = await response.json();

        if (data && data.success && data.graph) {
            // Tolerate a payload that omits nodes, edges or stats.
            allNodes = Array.isArray(data.graph.nodes) ? data.graph.nodes : [];
            allEdges = Array.isArray(data.graph.edges) ? data.graph.edges : [];

            // Clear an empty state left over from an earlier failed load.
            showEmpty(false);
            updateStats(data.graph.stats || {});
            renderGraph();
        } else {
            showEmpty(true);
        }
    } catch (error) {
        console.error('加载图谱失败:', error);
        showToast('加载图谱失败: ' + error.message);
        showEmpty(true);
    } finally {
        // Single exit point for the overlay instead of one call per branch.
        showLoading(false);
    }
}
| 618 | + | ||
/**
 * (Re)build the vis-network view from allNodes/allEdges, honoring the
 * node-type checkboxes in the sidebar.
 *
 * Only nodes whose group is checked are drawn, and only edges whose two
 * endpoints are both drawn. Called on initial load and on every filter
 * change, so the previous network is destroyed first.
 */
function renderGraph() {
    const container = document.getElementById('network');

    // Destroy the previous instance before creating a new one; otherwise every
    // filter toggle leaks a canvas, a physics loop and its event listeners.
    if (network) {
        network.destroy();
        network = null;
    }

    // Nodes: keep only the checked types.
    const visibleTypes = getVisibleTypes();
    const filteredNodes = allNodes.filter(n => visibleTypes.includes(n.group));
    const filteredNodeIds = new Set(filteredNodes.map(n => n.id));

    const nodes = new vis.DataSet(filteredNodes.map(node => {
        // Unknown groups fall back to a neutral gray.
        const baseColor = NODE_COLORS[node.group] || '#6B7280';
        return {
            id: node.id,
            label: truncateLabel(node.label, 20),
            title: node.title,
            group: node.group,
            color: {
                background: baseColor,
                border: baseColor,
                highlight: {
                    background: lightenColor(baseColor),
                    border: baseColor
                }
            },
            shape: NODE_SHAPES[node.group] || 'dot',
            size: node.group === 'topic' ? 30 : (node.group === 'engine' ? 25 : 15),
            font: {
                color: '#F1F5F9',
                size: 12
            },
            // Keep the raw API record alongside the display fields.
            _data: node
        };
    }));

    // Edges: drop any edge with a hidden endpoint.
    const edges = new vis.DataSet(allEdges
        .filter(e => filteredNodeIds.has(e.from) && filteredNodeIds.has(e.to))
        .map(edge => ({
            from: edge.from,
            to: edge.to,
            label: edge.label,
            arrows: edge.arrows || 'to',
            color: {
                color: '#475569',
                highlight: '#818CF8'
            },
            font: {
                color: '#94A3B8',
                size: 10,
                strokeWidth: 0
            },
            smooth: {
                type: 'continuous'
            }
        }))
    );

    // Network options.
    const options = {
        nodes: {
            borderWidth: 2,
            shadow: true
        },
        edges: {
            width: 1,
            shadow: true
        },
        physics: {
            enabled: true,
            solver: 'forceAtlas2Based',
            forceAtlas2Based: {
                gravitationalConstant: -100,
                centralGravity: 0.01,
                springLength: 150,
                springConstant: 0.08,
                damping: 0.5
            },
            stabilization: {
                enabled: true,
                iterations: 200
            }
        },
        interaction: {
            hover: true,
            tooltipDelay: 100,
            zoomView: true,
            dragView: true
        }
    };

    // Bind callbacks to this instance rather than the mutable global, so a
    // late event from this network never acts on a newer one.
    const instance = new vis.Network(container, { nodes, edges }, options);
    network = instance;

    // Clicking a node opens its details; clicking empty space closes them.
    instance.on('click', (params) => {
        if (params.nodes.length > 0) {
            const nodeId = params.nodes[0];
            const node = allNodes.find(n => n.id === nodeId);
            if (node) {
                showNodeDetail(node);
            }
        } else {
            hideNodeDetail();
        }
    });

    // Fit the view once the physics layout has stabilized.
    instance.once('stabilizationIterationsDone', () => {
        instance.fit({ animation: true });
    });
}
| 728 | + | ||
/**
 * Populate and reveal the sidebar detail panel for a node.
 *
 * Shows the node label, a localized type name (falling back to the raw
 * group) and one row per truthy property, with values truncated to 200
 * characters.
 *
 * @param {{label: string, group: string, properties?: Object}} node
 */
function showNodeDetail(node) {
    const detailPanel = document.getElementById('nodeDetail');
    const titleEl = document.getElementById('detailTitle');
    const typeEl = document.getElementById('detailType');
    const propsEl = document.getElementById('detailProps');

    titleEl.textContent = node.label;

    const typeLabels = {
        topic: '主题',
        engine: '分析引擎',
        section: '报告段落',
        search_query: '搜索关键词',
        source: '数据来源'
    };
    typeEl.textContent = typeLabels[node.group] || node.group;

    // Build rows with DOM APIs and textContent instead of innerHTML:
    // property keys and values originate from crawled content and must
    // never be parsed as markup.
    propsEl.textContent = '';
    const props = node.properties || {};
    let rendered = 0;
    for (const [key, value] of Object.entries(props)) {
        if (!value) {
            continue;
        }
        const row = document.createElement('div');
        row.className = 'prop-item';

        const keyEl = document.createElement('div');
        keyEl.className = 'prop-key';
        keyEl.textContent = key;

        const valueEl = document.createElement('div');
        valueEl.className = 'prop-value';
        valueEl.textContent = truncateText(String(value), 200);

        row.appendChild(keyEl);
        row.appendChild(valueEl);
        propsEl.appendChild(row);
        rendered += 1;
    }

    if (rendered === 0) {
        const placeholder = document.createElement('div');
        placeholder.className = 'prop-item';
        placeholder.textContent = '无附加属性';
        propsEl.appendChild(placeholder);
    }

    detailPanel.style.display = 'block';
}
| 764 | + | ||
/** Hide the sidebar node-detail panel. */
function hideNodeDetail() {
    const panel = document.getElementById('nodeDetail');
    panel.style.display = 'none';
}
| 769 | + | ||
/**
 * Write the graph totals and per-type node counts into the toolbar and
 * the sidebar filter list. Missing values are shown as 0.
 *
 * @param {Object} [stats] Counters keyed by `total_nodes`, `total_edges`
 *     and by node type; may be omitted entirely.
 */
function updateStats(stats) {
    // A payload without stats must not break rendering of the graph itself.
    const counts = stats || {};

    document.getElementById('nodeCount').textContent = counts.total_nodes || 0;
    document.getElementById('edgeCount').textContent = counts.total_edges || 0;

    // Each type has a matching `count-<type>` element in the sidebar.
    const nodeTypes = ['topic', 'engine', 'section', 'search_query', 'source'];
    for (const type of nodeTypes) {
        document.getElementById(`count-${type}`).textContent = counts[type] || 0;
    }
}
| 782 | + | ||
/**
 * Return the node types whose sidebar filter checkbox is currently ticked,
 * in document order (each value is the checkbox's `data-type`).
 */
function getVisibleTypes() {
    const checkboxes = document.querySelectorAll('.filter-item input[type="checkbox"]');
    return Array.from(checkboxes)
        .filter(box => box.checked)
        .map(box => box.dataset.type);
}
| 793 | + | ||
/**
 * Wire up all toolbar, search and filter controls. Called once after the
 * DOM is ready; handlers tolerate the network not having been created yet.
 */
function setupEventListeners() {
    // Sidebar toggle: the graph container and legend shift with it.
    document.getElementById('toggleSidebar').addEventListener('click', () => {
        const sidebar = document.getElementById('sidebar');
        const container = document.getElementById('graphContainer');
        const legend = document.getElementById('legend');

        sidebar.classList.toggle('collapsed');
        container.classList.toggle('fullwidth');
        legend.classList.toggle('fullwidth');
    });

    // Fit to view.
    document.getElementById('fitBtn').addEventListener('click', () => {
        if (network) network.fit({ animation: true });
    });

    // Zoom in.
    document.getElementById('zoomInBtn').addEventListener('click', () => {
        if (network) {
            const scale = network.getScale() * 1.2;
            network.moveTo({ scale, animation: true });
        }
    });

    // Zoom out.
    document.getElementById('zoomOutBtn').addEventListener('click', () => {
        if (network) {
            const scale = network.getScale() / 1.2;
            network.moveTo({ scale, animation: true });
        }
    });

    // Fullscreen toggle.
    document.getElementById('fullscreenBtn').addEventListener('click', () => {
        if (!document.fullscreenElement) {
            // The request can be rejected (e.g. denied by the browser);
            // report it instead of leaving an unhandled rejection.
            Promise.resolve(document.documentElement.requestFullscreen())
                .catch(err => showToast('无法进入全屏: ' + err.message));
        } else {
            document.exitFullscreen();
        }
    });

    // Search: select every visible node whose label contains the query and
    // focus the first hit.
    document.getElementById('searchInput').addEventListener('input', (e) => {
        const query = e.target.value.trim().toLowerCase();
        if (!query) {
            if (network) network.selectNodes([]);
            return;
        }
        if (!network) return;

        // Restrict matches to nodes that are actually drawn: vis-network
        // raises when asked to select an id that is not in the network.
        const visibleTypes = getVisibleTypes();
        const matchedIds = allNodes
            .filter(n => visibleTypes.includes(n.group))
            // Nodes without a label must not abort the whole search.
            .filter(n => String(n.label || '').toLowerCase().includes(query))
            .map(n => n.id);

        if (matchedIds.length > 0) {
            network.selectNodes(matchedIds);
            network.focus(matchedIds[0], { animation: true, scale: 1.5 });
        } else {
            // Drop a selection left over from a previous, broader query.
            network.selectNodes([]);
        }
    });

    // Type filters: any change rebuilds the graph.
    document.querySelectorAll('.filter-item input[type="checkbox"]').forEach(cb => {
        cb.addEventListener('change', () => {
            renderGraph();
        });
    });
}
| 862 | + | ||
// ---- Helper functions ----

/**
 * Show or hide the loading overlay.
 * @param {boolean} show Truthy to display the overlay, falsy to hide it.
 */
function showLoading(show) {
    const overlay = document.getElementById('loadingOverlay');
    if (show) {
        overlay.style.display = 'flex';
    } else {
        overlay.style.display = 'none';
    }
}
| 867 | + | ||
/**
 * Show or hide the "no graph data" placeholder.
 * @param {boolean} show Truthy to display the placeholder, falsy to hide it.
 */
function showEmpty(show) {
    const placeholder = document.getElementById('emptyState');
    if (show) {
        placeholder.style.display = 'block';
    } else {
        placeholder.style.display = 'none';
    }
}
| 871 | + | ||
/**
 * Display a transient message in the toast element for three seconds.
 *
 * @param {string} message Text to display (set via textContent, so it is
 *     never interpreted as HTML).
 */
function showToast(message) {
    const toast = document.getElementById('toast');
    toast.textContent = message;
    toast.style.display = 'block';

    // Cancel the pending hide from an earlier toast; otherwise that older
    // timer would hide this message before its own three seconds are up.
    clearTimeout(showToast.hideTimer);
    showToast.hideTimer = setTimeout(() => {
        toast.style.display = 'none';
    }, 3000);
}
| 880 | + | ||
/**
 * Shorten a node label to at most maxLen UTF-16 units, appending "..."
 * when anything was cut.
 *
 * @param {*} text Label to shorten; falsy values yield ''. Non-strings are
 *     converted with String() so they are truncated like any other label.
 * @param {number} maxLen Maximum number of characters to keep.
 * @returns {string}
 */
function truncateLabel(text, maxLen) {
    if (!text) return '';

    const str = String(text);
    if (!(str.length > maxLen)) return str;

    // If the cut would fall between the two halves of a surrogate pair
    // (e.g. an emoji), back off by one so no lone surrogate is emitted.
    let end = maxLen;
    const lastKept = str.charCodeAt(end - 1);
    if (lastKept >= 0xD800 && lastKept <= 0xDBFF) {
        end -= 1;
    }
    return str.slice(0, end) + '...';
}
| 885 | + | ||
/**
 * Shorten free text to at most maxLen UTF-16 units, appending "..." when
 * anything was cut.
 *
 * @param {*} text Text to shorten; falsy values yield ''. Non-strings are
 *     converted with String() first.
 * @param {number} maxLen Maximum number of characters to keep.
 * @returns {string}
 */
function truncateText(text, maxLen) {
    if (!text) return '';

    const str = String(text);
    if (!(str.length > maxLen)) return str;

    // Never end on a high surrogate: keeping it without its low half would
    // render as a broken character.
    let end = maxLen;
    const lastKept = str.charCodeAt(end - 1);
    if (lastKept >= 0xD800 && lastKept <= 0xDBFF) {
        end -= 1;
    }
    return str.slice(0, end) + '...';
}
| 890 | + | ||
/**
 * Return a lighter variant of a hex color as an `rgb(r, g, b)` string.
 *
 * Accepts `#RRGGBB` and the shorthand `#RGB` (an alpha suffix, if present,
 * is ignored), with or without the leading `#`. Input that is not a hex
 * color is returned unchanged rather than producing `rgb(NaN, NaN, NaN)`.
 *
 * @param {string} color Hex color to lighten.
 * @param {number} [amount=40] Value added to each channel; the result is
 *     clamped to 0..255, so a negative amount darkens.
 * @returns {string}
 */
function lightenColor(color, amount = 40) {
    let hex = String(color).trim().replace(/^#/, '');

    // Expand shorthand (#RGB / #RGBA) by doubling every digit.
    if (/^[0-9a-f]{3,4}$/i.test(hex)) {
        hex = hex.replace(/./g, digit => digit + digit);
    }
    if (!/^[0-9a-f]{6}([0-9a-f]{2})?$/i.test(hex)) {
        return color;
    }

    const channels = [0, 2, 4].map(offset => {
        const value = parseInt(hex.slice(offset, offset + 2), 16) + amount;
        return Math.min(255, Math.max(0, value));
    });
    return `rgb(${channels.join(', ')})`;
}
| 899 | + </script> | ||
| 900 | +</body> | ||
| 901 | +</html> |
-
Please register or login to post a comment