Dollars

修复数据库初始化问题、删除过时的数据库格式方法、修复空停用词错误。

@@ -35,6 +35,8 @@ class AsyncWordCloudGenerator:
35 jieba.add_word(word) 35 jieba.add_word(word)
36 36
37 def load_stop_words(self): 37 def load_stop_words(self):
  38 + if not os.path.exists(self.stop_words_file):
  39 + return set()
38 with open(self.stop_words_file, 'r', encoding='utf-8') as f: 40 with open(self.stop_words_file, 'r', encoding='utf-8') as f:
39 return set(f.read().strip().split('\n')) 41 return set(f.read().strip().split('\n'))
40 42
@@ -24,7 +24,7 @@ except ImportError:
24 logger.error("错误: 无法导入config.py配置文件") 24 logger.error("错误: 无法导入config.py配置文件")
25 sys.exit(1) 25 sys.exit(1)
26 26
27 -from MindSpider.config import settings 27 +from config import settings
28 28
29 class DatabaseManager: 29 class DatabaseManager:
30 def __init__(self): 30 def __init__(self):
1 -#!/usr/bin/env python3  
2 -# -*- coding: utf-8 -*-  
3 """ 1 """
4 -MindSpider AI爬虫项目 - 数据库初始化脚本  
5 -用于创建项目所需的所有数据库表 2 +MindSpider 数据库初始化(SQLAlchemy 2.x 异步引擎)
  3 +
  4 +此脚本创建 MindSpider 扩展表(与 MediaCrawler 原始表分离)。
  5 +支持 MySQL 与 PostgreSQL,需已有可连接的数据库实例。
  6 +
  7 +数据模型定义位置:
  8 +- MindSpider/schema/models_sa.py
6 """ 9 """
7 10
  11 +from __future__ import annotations
  12 +
  13 +import asyncio
8 import os 14 import os
  15 +from typing import Optional
  16 +
  17 +from loguru import logger
  18 +
  19 +from sqlalchemy.ext.asyncio import create_async_engine
  20 +from sqlalchemy import text
  21 +
  22 +from models_sa import Base
  23 +
  24 +# 导入 models_bigdata 以确保所有表类被注册到 Base.metadata
  25 +# models_bigdata 现在也使用 models_sa 的 Base,所以所有表都在同一个 metadata 中
  26 +import models_bigdata # noqa: F401 # 导入以注册所有表类
9 import sys 27 import sys
10 -import pymysql  
11 from pathlib import Path 28 from pathlib import Path
12 -from MindSpider.config import settings  
13 29
14 # 添加项目根目录到路径 30 # 添加项目根目录到路径
15 project_root = Path(__file__).parent.parent 31 project_root = Path(__file__).parent.parent
16 sys.path.append(str(project_root)) 32 sys.path.append(str(project_root))
17 33
18 -# 导入配置  
19 -try:  
20 - import config  
21 -except ImportError:  
22 - print("错误: 无法导入config.py配置文件")  
23 - print("请确保config.py文件存在于项目根目录")  
24 - sys.exit(1)  
25 -  
26 -def create_database_connection():  
27 - """创建数据库连接"""  
28 - try:  
29 - connection = pymysql.connect(  
30 - host=settings.db_host,  
31 - port=settings.db_port,  
32 - user=settings.db_user,  
33 - password=settings.db_password,  
34 - charset=settings.db_charset,  
35 - autocommit=True 34 +from config import settings
  35 +
  36 +def _env(key: str, default: Optional[str] = None) -> Optional[str]:
  37 + v = os.getenv(key)
  38 + return v if v not in (None, "") else default
  39 +
  40 +
  41 +def _build_database_url() -> str:
  42 + # 优先 DATABASE_URL
  43 + database_url = settings.DATABASE_URL if hasattr(settings, "DATABASE_URL") else None
  44 + if database_url:
  45 + return database_url
  46 +
  47 + dialect = (settings.DB_DIALECT or "mysql").lower()
  48 + host = settings.DB_HOST or "localhost"
  49 + port = str(settings.DB_PORT or ("3306" if dialect == "mysql" else "5432"))
  50 + user = settings.DB_USER or "root"
  51 + password = settings.DB_PASSWORD or ""
  52 + db_name = settings.DB_NAME or "mindspider"
  53 +
  54 + if dialect in ("postgresql", "postgres"):
  55 + return f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{db_name}"
  56 +
  57 + return f"mysql+aiomysql://{user}:{password}@{host}:{port}/{db_name}"
  58 +
  59 +
  60 +async def _create_views_if_needed(engine_dialect: str):
  61 + # 视图为可选;仅当业务需要时创建。两端使用通用 SQL 聚合避免方言函数。
  62 + # 如不需要视图,可跳过。
  63 + engine_dialect = engine_dialect.lower()
  64 + v_topic_crawling_stats = (
  65 + "CREATE OR REPLACE VIEW v_topic_crawling_stats AS\n"
  66 + "SELECT dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status,\n"
  67 + " COUNT(DISTINCT ct.task_id) AS total_tasks,\n"
  68 + " SUM(CASE WHEN ct.task_status = 'completed' THEN 1 ELSE 0 END) AS completed_tasks,\n"
  69 + " SUM(CASE WHEN ct.task_status = 'failed' THEN 1 ELSE 0 END) AS failed_tasks,\n"
  70 + " SUM(COALESCE(ct.total_crawled,0)) AS total_content_crawled,\n"
  71 + " SUM(COALESCE(ct.success_count,0)) AS total_success_count,\n"
  72 + " SUM(COALESCE(ct.error_count,0)) AS total_error_count\n"
  73 + "FROM daily_topics dt\n"
  74 + "LEFT JOIN crawling_tasks ct ON dt.topic_id = ct.topic_id\n"
  75 + "GROUP BY dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status"
  76 + )
  77 +
  78 + v_daily_summary = (
  79 + "CREATE OR REPLACE VIEW v_daily_summary AS\n"
  80 + "SELECT dn.crawl_date AS crawl_date,\n"
  81 + " COUNT(DISTINCT dn.news_id) AS total_news,\n"
  82 + " COUNT(DISTINCT dn.source_platform) AS platforms_covered,\n"
  83 + " (SELECT COUNT(*) FROM daily_topics WHERE extract_date = dn.crawl_date) AS topics_extracted,\n"
  84 + " (SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date = dn.crawl_date) AS tasks_created\n"
  85 + "FROM daily_news dn\n"
  86 + "GROUP BY dn.crawl_date\n"
  87 + "ORDER BY dn.crawl_date DESC"
36 ) 88 )
37 - print(f"成功连接到MySQL服务器: {settings.db_host}:{settings.db_port}")  
38 - return connection  
39 - except Exception as e:  
40 - print(f"连接数据库失败: {e}")  
41 - return None  
42 -  
43 -def create_database(connection):  
44 - """创建数据库"""  
45 - try:  
46 - cursor = connection.cursor()  
47 - cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{settings.db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")  
48 - cursor.execute(f"USE `{settings.db_name}`")  
49 - print(f"数据库 '{settings.db_name}' 创建/选择成功")  
50 - return True  
51 - except Exception as e:  
52 - print(f"创建数据库失败: {e}")  
53 - return False  
54 -  
55 -def execute_sql_file(connection, sql_file_path, description=""):  
56 - """执行SQL文件"""  
57 - if not os.path.exists(sql_file_path):  
58 - print(f"警告: SQL文件不存在: {sql_file_path}")  
59 - return False  
60 -  
61 - try:  
62 - cursor = connection.cursor()  
63 - with open(sql_file_path, 'r', encoding='utf-8') as f:  
64 - sql_content = f.read()  
65 -  
66 - # 分割SQL语句(简单实现,按分号分割)  
67 - sql_statements = [stmt.strip() for stmt in sql_content.split(';') if stmt.strip()]  
68 -  
69 - success_count = 0  
70 - error_count = 0  
71 -  
72 - for stmt in sql_statements:  
73 - if not stmt or stmt.startswith('--'):  
74 - continue  
75 - try:  
76 - cursor.execute(stmt)  
77 - success_count += 1  
78 - except Exception as e:  
79 - error_count += 1  
80 - print(f"执行SQL语句失败: {str(e)[:100]}...")  
81 -  
82 - print(f"{description} - 成功执行: {success_count} 条语句, 失败: {error_count} 条语句")  
83 - return error_count == 0  
84 -  
85 - except Exception as e:  
86 - print(f"执行SQL文件失败 {sql_file_path}: {e}")  
87 - return False  
88 -  
89 -def main():  
90 - """主函数"""  
91 - print("=" * 60)  
92 - print("MindSpider AI爬虫项目 - 数据库初始化")  
93 - print("=" * 60)  
94 -  
95 - # 检查配置  
96 - print("检查数据库配置...")  
97 - print(f"数据库主机: {settings.db_host}")  
98 - print(f"数据库端口: {settings.db_port}")  
99 - print(f"数据库名称: {settings.db_name}")  
100 - print(f"数据库用户: {settings.db_user}")  
101 - print(f"字符集: {settings.db_charset}")  
102 - print()  
103 -  
104 - # 创建数据库连接  
105 - print("正在连接数据库...")  
106 - connection = create_database_connection()  
107 - if not connection:  
108 - print("数据库初始化失败!")  
109 - return False  
110 -  
111 - try:  
112 - # 创建数据库  
113 - print("正在创建/选择数据库...")  
114 - if not create_database(connection):  
115 - return False  
116 -  
117 - # 获取SQL文件路径  
118 - schema_dir = Path(__file__).parent  
119 - mediacrawler_sql = schema_dir.parent / "DeepSentimentCrawling" / "MediaCrawler" / "schema" / "tables.sql"  
120 - mindspider_sql = schema_dir / "mindspider_tables.sql"  
121 -  
122 - print()  
123 - print("开始执行SQL脚本...")  
124 -  
125 - # 1. 执行MediaCrawler的原始表结构  
126 - if mediacrawler_sql.exists():  
127 - print("1. 创建MediaCrawler基础表...")  
128 - execute_sql_file(connection, str(mediacrawler_sql), "MediaCrawler基础表")  
129 - else:  
130 - print("警告: MediaCrawler SQL文件不存在,跳过基础表创建")  
131 -  
132 - # 2. 执行MindSpider扩展表结构  
133 - print("2. 创建MindSpider扩展表...")  
134 - if mindspider_sql.exists():  
135 - execute_sql_file(connection, str(mindspider_sql), "MindSpider扩展表")  
136 - else:  
137 - print("错误: MindSpider SQL文件不存在")  
138 - return False  
139 -  
140 - print()  
141 - print("=" * 60)  
142 - print("数据库初始化完成!")  
143 - print("=" * 60)  
144 -  
145 - # 显示创建的表  
146 - cursor = connection.cursor()  
147 - cursor.execute("SHOW TABLES")  
148 - tables = cursor.fetchall()  
149 -  
150 - print(f"数据库 '{settings.db_name}' 中共创建了 {len(tables)} 个表:")  
151 - for table in tables:  
152 - print(f" - {table[0]}")  
153 -  
154 - print()  
155 - print("数据库初始化成功完成!您现在可以开始使用MindSpider了。")  
156 - return True  
157 -  
158 - except Exception as e:  
159 - print(f"数据库初始化过程中发生错误: {e}")  
160 - return False  
161 -  
162 - finally:  
163 - if connection:  
164 - connection.close()  
165 - print("数据库连接已关闭") 89 +
  90 + # PostgreSQL 的 CREATE OR REPLACE VIEW 也可用;两端均执行
  91 + from sqlalchemy.ext.asyncio import AsyncEngine
  92 + engine: AsyncEngine = create_async_engine(_build_database_url())
  93 + async with engine.begin() as conn:
  94 + await conn.execute(text(v_topic_crawling_stats))
  95 + await conn.execute(text(v_daily_summary))
  96 + await engine.dispose()
  97 +
  98 +
  99 +async def main() -> None:
  100 + database_url = _build_database_url()
  101 + engine = create_async_engine(database_url, pool_pre_ping=True, pool_recycle=1800)
  102 +
  103 + # 由于 models_bigdata 和 models_sa 现在共享同一个 Base,所有表都在同一个 metadata 中
  104 + # 只需创建一次,SQLAlchemy 会自动处理表之间的依赖关系
  105 + async with engine.begin() as conn:
  106 + await conn.run_sync(Base.metadata.create_all)
  107 +
  108 + # 保持原有视图创建和释放逻辑
  109 + dialect_name = engine.url.get_backend_name()
  110 + await _create_views_if_needed(dialect_name)
  111 +
  112 + await engine.dispose()
  113 + logger.info("[init_database_sa] 数据表与视图创建完成")
  114 +
166 115
167 if __name__ == "__main__": 116 if __name__ == "__main__":
168 - success = main()  
169 - sys.exit(0 if success else 1) 117 + asyncio.run(main())
  118 +
  119 +
1 -"""  
2 -MindSpider 数据库初始化(SQLAlchemy 2.x 异步引擎)  
3 -  
4 -此脚本创建 MindSpider 扩展表(与 MediaCrawler 原始表分离)。  
5 -支持 MySQL 与 PostgreSQL,需已有可连接的数据库实例。  
6 -  
7 -数据模型定义位置:  
8 -- MindSpider/schema/models_sa.py  
9 -"""  
10 -  
11 -from __future__ import annotations  
12 -  
13 -import asyncio  
14 -import os  
15 -from typing import Optional  
16 -  
17 -from loguru import logger  
18 -  
19 -from sqlalchemy.ext.asyncio import create_async_engine  
20 -from sqlalchemy import text  
21 -  
22 -from models_sa import Base  
23 -  
24 -# 导入 models_bigdata 以确保所有表类被注册到 Base.metadata  
25 -# models_bigdata 现在也使用 models_sa 的 Base,所以所有表都在同一个 metadata 中  
26 -import models_bigdata # noqa: F401 # 导入以注册所有表类  
27 -import sys  
28 -from pathlib import Path  
29 -  
30 -# 添加项目根目录到路径  
31 -project_root = Path(__file__).parent.parent  
32 -sys.path.append(str(project_root))  
33 -  
34 -from config import settings  
35 -  
36 -def _env(key: str, default: Optional[str] = None) -> Optional[str]:  
37 - v = os.getenv(key)  
38 - return v if v not in (None, "") else default  
39 -  
40 -  
41 -def _build_database_url() -> str:  
42 - # 优先 DATABASE_URL  
43 - database_url = settings.DATABASE_URL if hasattr(settings, "DATABASE_URL") else None  
44 - if database_url:  
45 - return database_url  
46 -  
47 - dialect = (settings.DB_DIALECT or "mysql").lower()  
48 - host = settings.DB_HOST or "localhost"  
49 - port = str(settings.DB_PORT or ("3306" if dialect == "mysql" else "5432"))  
50 - user = settings.DB_USER or "root"  
51 - password = settings.DB_PASSWORD or ""  
52 - db_name = settings.DB_NAME or "mindspider"  
53 -  
54 - if dialect in ("postgresql", "postgres"):  
55 - return f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{db_name}"  
56 -  
57 - return f"mysql+aiomysql://{user}:{password}@{host}:{port}/{db_name}"  
58 -  
59 -  
60 -async def _create_views_if_needed(engine_dialect: str):  
61 - # 视图为可选;仅当业务需要时创建。两端使用通用 SQL 聚合避免方言函数。  
62 - # 如不需要视图,可跳过。  
63 - engine_dialect = engine_dialect.lower()  
64 - v_topic_crawling_stats = (  
65 - "CREATE OR REPLACE VIEW v_topic_crawling_stats AS\n"  
66 - "SELECT dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status,\n"  
67 - " COUNT(DISTINCT ct.task_id) AS total_tasks,\n"  
68 - " SUM(CASE WHEN ct.task_status = 'completed' THEN 1 ELSE 0 END) AS completed_tasks,\n"  
69 - " SUM(CASE WHEN ct.task_status = 'failed' THEN 1 ELSE 0 END) AS failed_tasks,\n"  
70 - " SUM(COALESCE(ct.total_crawled,0)) AS total_content_crawled,\n"  
71 - " SUM(COALESCE(ct.success_count,0)) AS total_success_count,\n"  
72 - " SUM(COALESCE(ct.error_count,0)) AS total_error_count\n"  
73 - "FROM daily_topics dt\n"  
74 - "LEFT JOIN crawling_tasks ct ON dt.topic_id = ct.topic_id\n"  
75 - "GROUP BY dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status"  
76 - )  
77 -  
78 - v_daily_summary = (  
79 - "CREATE OR REPLACE VIEW v_daily_summary AS\n"  
80 - "SELECT dn.crawl_date AS crawl_date,\n"  
81 - " COUNT(DISTINCT dn.news_id) AS total_news,\n"  
82 - " COUNT(DISTINCT dn.source_platform) AS platforms_covered,\n"  
83 - " (SELECT COUNT(*) FROM daily_topics WHERE extract_date = dn.crawl_date) AS topics_extracted,\n"  
84 - " (SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date = dn.crawl_date) AS tasks_created\n"  
85 - "FROM daily_news dn\n"  
86 - "GROUP BY dn.crawl_date\n"  
87 - "ORDER BY dn.crawl_date DESC"  
88 - )  
89 -  
90 - # PostgreSQL 的 CREATE OR REPLACE VIEW 也可用;两端均执行  
91 - from sqlalchemy.ext.asyncio import AsyncEngine  
92 - engine: AsyncEngine = create_async_engine(_build_database_url())  
93 - async with engine.begin() as conn:  
94 - await conn.execute(text(v_topic_crawling_stats))  
95 - await conn.execute(text(v_daily_summary))  
96 - await engine.dispose()  
97 -  
98 -  
99 -async def main() -> None:  
100 - database_url = _build_database_url()  
101 - engine = create_async_engine(database_url, pool_pre_ping=True, pool_recycle=1800)  
102 -  
103 - # 由于 models_bigdata 和 models_sa 现在共享同一个 Base,所有表都在同一个 metadata 中  
104 - # 只需创建一次,SQLAlchemy 会自动处理表之间的依赖关系  
105 - async with engine.begin() as conn:  
106 - await conn.run_sync(Base.metadata.create_all)  
107 -  
108 - # 保持原有视图创建和释放逻辑  
109 - dialect_name = engine.url.get_backend_name()  
110 - await _create_views_if_needed(dialect_name)  
111 -  
112 - await engine.dispose()  
113 - logger.info("[init_database_sa] 数据表与视图创建完成")  
114 -  
115 -  
116 -if __name__ == "__main__":  
117 - asyncio.run(main())  
118 -  
119 -  
@@ -34,6 +34,8 @@ pymysql==1.1.0
34 aiomysql==0.2.0 34 aiomysql==0.2.0
35 aiosqlite==0.21.0 35 aiosqlite==0.21.0
36 redis>=4.6.0 36 redis>=4.6.0
  37 +SQLAlchemy==2.0.35
  38 +asyncpg==0.29.0
37 39
38 # ===== 爬虫相关 ===== 40 # ===== 爬虫相关 =====
39 playwright==1.45.0 41 playwright==1.45.0
@@ -64,6 +66,7 @@ tqdm>=4.65.0
64 tenacity==8.2.2 66 tenacity==8.2.2
65 loguru>=0.7.0 67 loguru>=0.7.0
66 pydantic==2.5.2 68 pydantic==2.5.2
  69 +pydantic-settings==2.2.1
67 70
68 # ===== 开发工具(可选) ===== 71 # ===== 开发工具(可选) =====
69 pytest>=7.4.0 72 pytest>=7.4.0