mindspider_tables.sql 9.63 KB
-- MindSpider AI爬虫项目 - 数据库表结构
-- 基于MediaCrawler表结构扩展,添加BroadTopicExtraction模块所需表

-- ===============================
-- BroadTopicExtraction 模块表结构
-- ===============================

-- ----------------------------
-- Table structure for daily_news
-- 每日新闻表:存储get_today_news.py获取的热点新闻
-- ----------------------------
DROP TABLE IF EXISTS `daily_news`;
CREATE TABLE `daily_news` (
    `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `news_id` varchar(128) NOT NULL COMMENT '新闻唯一ID',
    `source_platform` varchar(32) NOT NULL COMMENT '新闻源平台(weibo|zhihu|bilibili|toutiao|douyin等)',
    `title` varchar(500) NOT NULL COMMENT '新闻标题',
    `url` varchar(512) DEFAULT NULL COMMENT '新闻链接',
    `description` text COMMENT '新闻描述或摘要',
    `extra_info` text COMMENT '额外信息(JSON格式存储)',
    `crawl_date` date NOT NULL COMMENT '爬取日期',
    `rank_position` int DEFAULT NULL COMMENT '在热榜中的排名位置',
    `add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_daily_news_unique` (`news_id`, `source_platform`, `crawl_date`),
    KEY `idx_daily_news_date` (`crawl_date`),
    KEY `idx_daily_news_platform` (`source_platform`),
    KEY `idx_daily_news_rank` (`rank_position`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='每日热点新闻表';

-- ----------------------------
-- Table structure for daily_topics
-- 每日话题表:存储TopicGPT提取的话题信息
-- ----------------------------
DROP TABLE IF EXISTS `daily_topics`;
CREATE TABLE `daily_topics` (
    `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `topic_id` varchar(64) NOT NULL COMMENT '话题唯一ID',
    `topic_name` varchar(255) NOT NULL COMMENT '话题名称',
    `topic_description` text COMMENT '话题描述',
    `keywords` text COMMENT '话题关键词(JSON格式存储)',
    `extract_date` date NOT NULL COMMENT '话题提取日期',
    `relevance_score` float DEFAULT NULL COMMENT '话题相关性得分',
    `news_count` int DEFAULT 0 COMMENT '关联的新闻数量',
    `processing_status` varchar(16) DEFAULT 'pending' COMMENT '处理状态(pending|processing|completed|failed)',
    `add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_daily_topics_unique` (`topic_id`, `extract_date`),
    KEY `idx_daily_topics_date` (`extract_date`),
    KEY `idx_daily_topics_status` (`processing_status`),
    KEY `idx_daily_topics_score` (`relevance_score`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='每日提取话题表';

-- ----------------------------
-- Table structure for topic_news_relation
-- 话题新闻关联表:记录话题和新闻的关联关系
-- ----------------------------
DROP TABLE IF EXISTS `topic_news_relation`;
CREATE TABLE `topic_news_relation` (
    `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `topic_id` varchar(64) NOT NULL COMMENT '话题ID',
    `news_id` varchar(128) NOT NULL COMMENT '新闻ID',
    `relation_score` float DEFAULT NULL COMMENT '关联度得分',
    `extract_date` date NOT NULL COMMENT '关联提取日期',
    `add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_topic_news_unique` (`topic_id`, `news_id`, `extract_date`),
    KEY `idx_topic_news_topic` (`topic_id`),
    KEY `idx_topic_news_news` (`news_id`),
    KEY `idx_topic_news_date` (`extract_date`),
    FOREIGN KEY (`topic_id`) REFERENCES `daily_topics`(`topic_id`) ON DELETE CASCADE,
    FOREIGN KEY (`news_id`) REFERENCES `daily_news`(`news_id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='话题新闻关联表';

-- ----------------------------
-- Table structure for crawling_tasks
-- 爬取任务表:记录基于话题的平台爬取任务
-- ----------------------------
DROP TABLE IF EXISTS `crawling_tasks`;
CREATE TABLE `crawling_tasks` (
    `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `task_id` varchar(64) NOT NULL COMMENT '任务唯一ID',
    `topic_id` varchar(64) NOT NULL COMMENT '关联的话题ID',
    `platform` varchar(32) NOT NULL COMMENT '目标平台(xhs|dy|ks|bili|wb|tieba|zhihu)',
    `search_keywords` text NOT NULL COMMENT '搜索关键词(JSON格式存储)',
    `task_status` varchar(16) DEFAULT 'pending' COMMENT '任务状态(pending|running|completed|failed|paused)',
    `start_time` bigint DEFAULT NULL COMMENT '任务开始时间戳',
    `end_time` bigint DEFAULT NULL COMMENT '任务结束时间戳',
    `total_crawled` int DEFAULT 0 COMMENT '已爬取内容数量',
    `success_count` int DEFAULT 0 COMMENT '成功爬取数量',
    `error_count` int DEFAULT 0 COMMENT '错误数量',
    `error_message` text COMMENT '错误信息',
    `config_params` text COMMENT '爬取配置参数(JSON格式)',
    `scheduled_date` date NOT NULL COMMENT '计划执行日期',
    `add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_crawling_tasks_unique` (`task_id`),
    KEY `idx_crawling_tasks_topic` (`topic_id`),
    KEY `idx_crawling_tasks_platform` (`platform`),
    KEY `idx_crawling_tasks_status` (`task_status`),
    KEY `idx_crawling_tasks_date` (`scheduled_date`),
    FOREIGN KEY (`topic_id`) REFERENCES `daily_topics`(`topic_id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='爬取任务表';

-- ===============================
-- MediaCrawler表结构扩展字段
-- ===============================

-- 为MediaCrawler现有表添加话题关联字段,支持MindSpider功能
-- 注意:这些字段是可选的,不影响MediaCrawler原有功能

-- 为小红书笔记表添加话题关联字段
ALTER TABLE `xhs_note` 
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- 为抖音视频表添加话题关联字段
ALTER TABLE `douyin_aweme` 
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- 为快手视频表添加话题关联字段
ALTER TABLE `kuaishou_video` 
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- 为B站视频表添加话题关联字段
ALTER TABLE `bilibili_video` 
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- 为微博帖子表添加话题关联字段
ALTER TABLE `weibo_note` 
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- 为贴吧帖子表添加话题关联字段
ALTER TABLE `tieba_note` 
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- 为知乎内容表添加话题关联字段
ALTER TABLE `zhihu_content` 
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- ===============================
-- 创建视图用于数据分析
-- ===============================

-- 话题爬取统计视图
CREATE OR REPLACE VIEW `v_topic_crawling_stats` AS
SELECT 
    dt.topic_id,
    dt.topic_name,
    dt.extract_date,
    dt.processing_status,
    COUNT(DISTINCT ct.task_id) as total_tasks,
    SUM(CASE WHEN ct.task_status = 'completed' THEN 1 ELSE 0 END) as completed_tasks,
    SUM(CASE WHEN ct.task_status = 'failed' THEN 1 ELSE 0 END) as failed_tasks,
    SUM(ct.total_crawled) as total_content_crawled,
    SUM(ct.success_count) as total_success_count,
    SUM(ct.error_count) as total_error_count
FROM daily_topics dt
LEFT JOIN crawling_tasks ct ON dt.topic_id = ct.topic_id
GROUP BY dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status;

-- 每日数据统计视图
CREATE OR REPLACE VIEW `v_daily_summary` AS
SELECT 
    crawl_date,
    COUNT(DISTINCT news_id) as total_news,
    COUNT(DISTINCT source_platform) as platforms_covered,
    (SELECT COUNT(*) FROM daily_topics WHERE extract_date = dn.crawl_date) as topics_extracted,
    (SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date = dn.crawl_date) as tasks_created
FROM daily_news dn
GROUP BY crawl_date
ORDER BY crawl_date DESC;

-- ===============================
-- 初始化索引优化
-- ===============================

-- 为关联查询优化添加复合索引
CREATE INDEX `idx_topic_date_status` ON `daily_topics` (`extract_date`, `processing_status`);
CREATE INDEX `idx_task_topic_platform` ON `crawling_tasks` (`topic_id`, `platform`, `task_status`);
CREATE INDEX `idx_news_date_platform` ON `daily_news` (`crawl_date`, `source_platform`);

-- ===============================
-- 数据库配置优化建议
-- ===============================

-- 设置合适的字符集和排序规则
-- ALTER DATABASE mindspider CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

-- 建议的数据保留策略(可选)
-- 可以根据需要创建事件调度器来清理历史数据
-- 例如:删除90天前的新闻数据,保留话题和爬取结果数据