mindspider_tables.sql
9.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
-- MindSpider AI爬虫项目 - 数据库表结构
-- 基于MediaCrawler表结构扩展,添加BroadTopicExtraction模块所需表
-- ===============================
-- BroadTopicExtraction 模块表结构
-- ===============================
-- ----------------------------
-- Table structure for daily_news
-- 每日新闻表:存储get_today_news.py获取的热点新闻
-- ----------------------------
DROP TABLE IF EXISTS `daily_news`;
CREATE TABLE `daily_news` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`news_id` varchar(128) NOT NULL COMMENT '新闻唯一ID',
`source_platform` varchar(32) NOT NULL COMMENT '新闻源平台(weibo|zhihu|bilibili|toutiao|douyin等)',
`title` varchar(500) NOT NULL COMMENT '新闻标题',
`url` varchar(512) DEFAULT NULL COMMENT '新闻链接',
`description` text COMMENT '新闻描述或摘要',
`extra_info` text COMMENT '额外信息(JSON格式存储)',
`crawl_date` date NOT NULL COMMENT '爬取日期',
`rank_position` int DEFAULT NULL COMMENT '在热榜中的排名位置',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_daily_news_unique` (`news_id`, `source_platform`, `crawl_date`),
KEY `idx_daily_news_date` (`crawl_date`),
KEY `idx_daily_news_platform` (`source_platform`),
KEY `idx_daily_news_rank` (`rank_position`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='每日热点新闻表';
-- ----------------------------
-- Table structure for daily_topics
-- 每日话题表:存储TopicGPT提取的话题信息
-- ----------------------------
DROP TABLE IF EXISTS `daily_topics`;
CREATE TABLE `daily_topics` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`topic_id` varchar(64) NOT NULL COMMENT '话题唯一ID',
`topic_name` varchar(255) NOT NULL COMMENT '话题名称',
`topic_description` text COMMENT '话题描述',
`keywords` text COMMENT '话题关键词(JSON格式存储)',
`extract_date` date NOT NULL COMMENT '话题提取日期',
`relevance_score` float DEFAULT NULL COMMENT '话题相关性得分',
`news_count` int DEFAULT 0 COMMENT '关联的新闻数量',
`processing_status` varchar(16) DEFAULT 'pending' COMMENT '处理状态(pending|processing|completed|failed)',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_daily_topics_unique` (`topic_id`, `extract_date`),
KEY `idx_daily_topics_date` (`extract_date`),
KEY `idx_daily_topics_status` (`processing_status`),
KEY `idx_daily_topics_score` (`relevance_score`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='每日提取话题表';
-- ----------------------------
-- Table structure for topic_news_relation
-- 话题新闻关联表:记录话题和新闻的关联关系
-- ----------------------------
DROP TABLE IF EXISTS `topic_news_relation`;
CREATE TABLE `topic_news_relation` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`topic_id` varchar(64) NOT NULL COMMENT '话题ID',
`news_id` varchar(128) NOT NULL COMMENT '新闻ID',
`relation_score` float DEFAULT NULL COMMENT '关联度得分',
`extract_date` date NOT NULL COMMENT '关联提取日期',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_topic_news_unique` (`topic_id`, `news_id`, `extract_date`),
KEY `idx_topic_news_topic` (`topic_id`),
KEY `idx_topic_news_news` (`news_id`),
KEY `idx_topic_news_date` (`extract_date`),
FOREIGN KEY (`topic_id`) REFERENCES `daily_topics`(`topic_id`) ON DELETE CASCADE,
FOREIGN KEY (`news_id`) REFERENCES `daily_news`(`news_id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='话题新闻关联表';
-- ----------------------------
-- Table structure for crawling_tasks
-- 爬取任务表:记录基于话题的平台爬取任务
-- ----------------------------
DROP TABLE IF EXISTS `crawling_tasks`;
CREATE TABLE `crawling_tasks` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`task_id` varchar(64) NOT NULL COMMENT '任务唯一ID',
`topic_id` varchar(64) NOT NULL COMMENT '关联的话题ID',
`platform` varchar(32) NOT NULL COMMENT '目标平台(xhs|dy|ks|bili|wb|tieba|zhihu)',
`search_keywords` text NOT NULL COMMENT '搜索关键词(JSON格式存储)',
`task_status` varchar(16) DEFAULT 'pending' COMMENT '任务状态(pending|running|completed|failed|paused)',
`start_time` bigint DEFAULT NULL COMMENT '任务开始时间戳',
`end_time` bigint DEFAULT NULL COMMENT '任务结束时间戳',
`total_crawled` int DEFAULT 0 COMMENT '已爬取内容数量',
`success_count` int DEFAULT 0 COMMENT '成功爬取数量',
`error_count` int DEFAULT 0 COMMENT '错误数量',
`error_message` text COMMENT '错误信息',
`config_params` text COMMENT '爬取配置参数(JSON格式)',
`scheduled_date` date NOT NULL COMMENT '计划执行日期',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_crawling_tasks_unique` (`task_id`),
KEY `idx_crawling_tasks_topic` (`topic_id`),
KEY `idx_crawling_tasks_platform` (`platform`),
KEY `idx_crawling_tasks_status` (`task_status`),
KEY `idx_crawling_tasks_date` (`scheduled_date`),
FOREIGN KEY (`topic_id`) REFERENCES `daily_topics`(`topic_id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='爬取任务表';
-- ===============================
-- MediaCrawler表结构扩展字段
-- ===============================
-- 为MediaCrawler现有表添加话题关联字段,支持MindSpider功能
-- 注意:这些字段是可选的,不影响MediaCrawler原有功能
-- 为小红书笔记表添加话题关联字段
ALTER TABLE `xhs_note`
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
-- 为抖音视频表添加话题关联字段
ALTER TABLE `douyin_aweme`
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
-- 为快手视频表添加话题关联字段
ALTER TABLE `kuaishou_video`
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
-- 为B站视频表添加话题关联字段
ALTER TABLE `bilibili_video`
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
-- 为微博帖子表添加话题关联字段
ALTER TABLE `weibo_note`
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
-- 为贴吧帖子表添加话题关联字段
ALTER TABLE `tieba_note`
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
-- 为知乎内容表添加话题关联字段
ALTER TABLE `zhihu_content`
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
-- ===============================
-- 创建视图用于数据分析
-- ===============================
-- 话题爬取统计视图
CREATE OR REPLACE VIEW `v_topic_crawling_stats` AS
SELECT
dt.topic_id,
dt.topic_name,
dt.extract_date,
dt.processing_status,
COUNT(DISTINCT ct.task_id) as total_tasks,
SUM(CASE WHEN ct.task_status = 'completed' THEN 1 ELSE 0 END) as completed_tasks,
SUM(CASE WHEN ct.task_status = 'failed' THEN 1 ELSE 0 END) as failed_tasks,
SUM(ct.total_crawled) as total_content_crawled,
SUM(ct.success_count) as total_success_count,
SUM(ct.error_count) as total_error_count
FROM daily_topics dt
LEFT JOIN crawling_tasks ct ON dt.topic_id = ct.topic_id
GROUP BY dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status;
-- 每日数据统计视图
CREATE OR REPLACE VIEW `v_daily_summary` AS
SELECT
crawl_date,
COUNT(DISTINCT news_id) as total_news,
COUNT(DISTINCT source_platform) as platforms_covered,
(SELECT COUNT(*) FROM daily_topics WHERE extract_date = dn.crawl_date) as topics_extracted,
(SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date = dn.crawl_date) as tasks_created
FROM daily_news dn
GROUP BY crawl_date
ORDER BY crawl_date DESC;
-- ===============================
-- 初始化索引优化
-- ===============================
-- 为关联查询优化添加复合索引
CREATE INDEX `idx_topic_date_status` ON `daily_topics` (`extract_date`, `processing_status`);
CREATE INDEX `idx_task_topic_platform` ON `crawling_tasks` (`topic_id`, `platform`, `task_status`);
CREATE INDEX `idx_news_date_platform` ON `daily_news` (`crawl_date`, `source_platform`);
-- ===============================
-- 数据库配置优化建议
-- ===============================
-- 设置合适的字符集和排序规则
-- ALTER DATABASE mindspider CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
-- 建议的数据保留策略(可选)
-- 可以根据需要创建事件调度器来清理历史数据
-- 例如:删除90天前的新闻数据,保留话题和爬取结果数据