Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Doiiars
2026-01-12 17:03:14 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
947879fe705add42239e26162d0c922997e81542
947879fe
1 parent
dc1382d5
feat: migrate MediaCrawler to git submodule and enhance MindSpider automation
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
130 additions
and
27 deletions
.gitignore
MindSpider/DeepSentimentCrawling/platform_crawler.py
MindSpider/README.md
MindSpider/main.py
requirements.txt
.gitignore
View file @
947879f
...
...
@@ -340,6 +340,7 @@ test_results/
# Ai操作指引文件
OperationGuidance/
db_data/
insight_engine_streamlit_reports/
media_engine_streamlit_reports/
query_engine_streamlit_reports/
...
...
MindSpider/DeepSentimentCrawling/platform_crawler.py
View file @
947879f
...
...
@@ -107,19 +107,34 @@ sqlite_db_config = {{
"db_path": SQLITE_DB_PATH
}}
# postgresql config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量
POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "{pg_password}")
POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "{pg_user}")
POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "{pg_host}")
POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "{pg_port}")
POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "{pg_db_name}")
# mongodb config
MONGODB_HOST = os.getenv("MONGODB_HOST", "localhost")
MONGODB_PORT = os.getenv("MONGODB_PORT", 27017)
MONGODB_USER = os.getenv("MONGODB_USER", "")
MONGODB_PWD = os.getenv("MONGODB_PWD", "")
MONGODB_DB_NAME = os.getenv("MONGODB_DB_NAME", "media_crawler")
postgresql_db_config = {{
"user": POSTGRESQL_DB_USER,
"password": POSTGRESQL_DB_PWD,
"host": POSTGRESQL_DB_HOST,
"port": POSTGRESQL_DB_PORT,
"db_name": POSTGRESQL_DB_NAME,
mongodb_config = {{
"host": MONGODB_HOST,
"port": int(MONGODB_PORT),
"user": MONGODB_USER,
"password": MONGODB_PWD,
"db_name": MONGODB_DB_NAME,
}}
# postgres config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量
POSTGRES_DB_PWD = os.getenv("POSTGRES_DB_PWD", "{pg_password}")
POSTGRES_DB_USER = os.getenv("POSTGRES_DB_USER", "{pg_user}")
POSTGRES_DB_HOST = os.getenv("POSTGRES_DB_HOST", "{pg_host}")
POSTGRES_DB_PORT = os.getenv("POSTGRES_DB_PORT", "{pg_port}")
POSTGRES_DB_NAME = os.getenv("POSTGRES_DB_NAME", "{pg_db_name}")
postgres_db_config = {{
"user": POSTGRES_DB_USER,
"password": POSTGRES_DB_PWD,
"host": POSTGRES_DB_HOST,
"port": POSTGRES_DB_PORT,
"db_name": POSTGRES_DB_NAME,
}}
'''
...
...
@@ -154,7 +169,7 @@ postgresql_db_config = {{
# 判断数据库类型,确定 SAVE_DATA_OPTION
db_dialect
=
(
config
.
settings
.
DB_DIALECT
or
"mysql"
)
.
lower
()
is_postgresql
=
db_dialect
in
(
"postgresql"
,
"postgres"
)
save_data_option
=
"postgres
ql
"
if
is_postgresql
else
"db"
save_data_option
=
"postgres"
if
is_postgresql
else
"db"
base_config_path
=
self
.
mediacrawler_path
/
"config"
/
"base_config.py"
...
...
@@ -238,7 +253,7 @@ postgresql_db_config = {{
# 判断数据库类型,确定 save_data_option
db_dialect
=
(
config
.
settings
.
DB_DIALECT
or
"mysql"
)
.
lower
()
is_postgresql
=
db_dialect
in
(
"postgresql"
,
"postgres"
)
save_data_option
=
"postgres
ql
"
if
is_postgresql
else
"db"
save_data_option
=
"postgres"
if
is_postgresql
else
"db"
# 构建命令
cmd
=
[
...
...
@@ -401,7 +416,7 @@ postgresql_db_config = {{
total_stats
[
"keyword_results"
][
keyword
]
=
{}
total_stats
[
"keyword_results"
][
keyword
][
platform
]
=
result
logger
.
info
(
f
" ✅
成功: {notes_count} 条内容, {comments_count} 条评论
"
)
logger
.
info
(
f
" ✅
爬取成功
"
)
else
:
total_stats
[
"failed_tasks"
]
+=
len
(
keywords
)
total_stats
[
"platform_summary"
][
platform
][
"failed_keywords"
]
=
len
(
keywords
)
...
...
@@ -433,15 +448,12 @@ postgresql_db_config = {{
finish_message
+=
f
"
\n
成功: {total_stats['successful_tasks']}"
finish_message
+=
f
"
\n
失败: {total_stats['failed_tasks']}"
finish_message
+=
f
"
\n
成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}
%
"
finish_message
+=
f
"
\n
总内容: {total_stats['total_notes']} 条"
finish_message
+=
f
"
\n
总评论: {total_stats['total_comments']} 条"
logger
.
info
(
finish_message
)
platform_summary_message
=
f
"
\n
📈
各平台统计:"
platform_summary_message
=
f
"
\n
📈
各平台统计:"
for
platform
,
stats
in
total_stats
[
"platform_summary"
]
.
items
():
success_rate
=
stats
[
"successful_keywords"
]
/
len
(
keywords
)
*
100
if
keywords
else
0
platform_summary_message
+=
f
"
\n
{platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}
%
), "
platform_summary_message
+=
f
"{stats['total_notes']} 条内容"
platform_summary_message
+=
f
"
\n
{platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}
%
)"
logger
.
info
(
platform_summary_message
)
return
total_stats
...
...
MindSpider/README.md
View file @
947879f
...
...
@@ -186,8 +186,8 @@ flowchart TB
-
记录任务状态、进度、结果等
5.
**平台内容表**
(继承自MediaCrawler)
-
xhs_note - 小红书笔记(暂时废弃,详情查看:https://github.com/NanmiCoder/MediaCrawler/issues/754)
-
douyin_aweme - 抖音视频
-
xhs_note - 小红书笔记
-
douyin_aweme - 抖音视频
-
kuaishou_video - 快手视频
-
bilibili_video - B站视频
-
weibo_note - 微博帖子
...
...
@@ -204,13 +204,27 @@ flowchart TB
-
操作系统:Windows/Linux/macOS
### 1. 克隆项目
### 1. 克隆项目与获取子模块
MindSpider 作为 BettaFish 的核心组件运行。请克隆 BettaFish 主项目并同步获取
`MediaCrawler`
爬虫子模块。
**方式一:克隆时直接获取(推荐)**
```
bash
git clone --recurse-submodules https://github.com/666ghj/BettaFish.git
cd
BettaFish/MindSpider
```
**方式二:已克隆主项目后补充拉取**
如果你已经克隆了 BettaFish 但
`MindSpider/DeepSentimentCrawling/MediaCrawler`
目录为空,请在
**项目根目录**
运行:
```
bash
git clone https://github.com/yourusername/MindSpider.git
cd
MindSpider
git submodule update --init --recursive
```
> **注意**:MediaCrawler 的 Python 依赖会在首次运行 `python main.py` 时由系统自动检测并静默安装到当前环境。
### 2. 创建并激活环境
#### Conda配置方法
...
...
@@ -316,7 +330,7 @@ python main.py --broad-topic --date 2024-01-15
**首次使用每个平台都需要登录,这是最关键的步骤:**
1.
**小红书登录**
(暂时废弃,详情查看:https://github.com/NanmiCoder/MediaCrawler/issues/754)
1.
**小红书登录**
```
bash
# 测试小红书爬取(会弹出二维码)
python main.py --deep-sentiment --platforms xhs --test
...
...
MindSpider/main.py
View file @
947879f
...
...
@@ -165,6 +165,21 @@ class MindSpider:
logger
.
exception
(
f
"数据库初始化异常: {e}"
)
return
False
def
_ensure_database_ready
(
self
)
->
bool
:
"""确保数据库表已就绪,如不存在则自动初始化"""
if
not
self
.
check_database_connection
():
logger
.
error
(
"数据库连接失败,无法继续"
)
return
False
if
not
self
.
check_database_tables
():
logger
.
warning
(
"数据库表不存在,自动初始化中..."
)
if
not
self
.
initialize_database
():
logger
.
error
(
"数据库自动初始化失败"
)
return
False
logger
.
info
(
"数据库表自动初始化成功"
)
return
True
def
check_dependencies
(
self
)
->
bool
:
"""检查依赖环境"""
logger
.
info
(
"检查依赖环境..."
)
...
...
@@ -184,19 +199,69 @@ class MindSpider:
logger
.
info
(
"请运行: pip install -r requirements.txt"
)
return
False
# 检查MediaCrawler依赖
# 检查
并安装
MediaCrawler依赖
mediacrawler_path
=
self
.
deep_sentiment_path
/
"MediaCrawler"
if
not
mediacrawler_path
.
exists
():
logger
.
error
(
"错误:找不到MediaCrawler目录"
)
return
False
# 自动安装MediaCrawler的依赖
self
.
_install_mediacrawler_dependencies
()
logger
.
info
(
"依赖环境检查通过"
)
return
True
def
_install_mediacrawler_dependencies
(
self
)
->
bool
:
"""自动安装MediaCrawler子模块的依赖"""
mediacrawler_req
=
self
.
deep_sentiment_path
/
"MediaCrawler"
/
"requirements.txt"
if
not
mediacrawler_req
.
exists
():
logger
.
warning
(
f
"MediaCrawler requirements.txt 不存在: {mediacrawler_req}"
)
return
False
# 检查是否已安装过(使用标记文件)
marker_file
=
self
.
deep_sentiment_path
/
"MediaCrawler"
/
".deps_installed"
req_mtime
=
mediacrawler_req
.
stat
()
.
st_mtime
if
marker_file
.
exists
():
marker_mtime
=
marker_file
.
stat
()
.
st_mtime
if
marker_mtime
>=
req_mtime
:
logger
.
debug
(
"MediaCrawler依赖已安装,跳过"
)
return
True
logger
.
info
(
"正在安装MediaCrawler依赖..."
)
try
:
result
=
subprocess
.
run
(
[
sys
.
executable
,
"-m"
,
"pip"
,
"install"
,
"-r"
,
str
(
mediacrawler_req
),
"-q"
],
capture_output
=
True
,
text
=
True
,
timeout
=
300
# 5分钟超时
)
if
result
.
returncode
==
0
:
# 创建标记文件
marker_file
.
touch
()
logger
.
info
(
"MediaCrawler依赖安装成功"
)
return
True
else
:
logger
.
error
(
f
"MediaCrawler依赖安装失败: {result.stderr}"
)
return
False
except
subprocess
.
TimeoutExpired
:
logger
.
error
(
"MediaCrawler依赖安装超时"
)
return
False
except
Exception
as
e
:
logger
.
exception
(
f
"MediaCrawler依赖安装异常: {e}"
)
return
False
def
run_broad_topic_extraction
(
self
,
extract_date
:
date
=
None
,
keywords_count
:
int
=
100
)
->
bool
:
"""运行BroadTopicExtraction模块"""
logger
.
info
(
"运行BroadTopicExtraction模块..."
)
# 自动检查并初始化数据库表
if
not
self
.
_ensure_database_ready
():
return
False
if
not
extract_date
:
extract_date
=
date
.
today
()
...
...
@@ -234,6 +299,10 @@ class MindSpider:
"""运行DeepSentimentCrawling模块"""
logger
.
info
(
"运行DeepSentimentCrawling模块..."
)
# 自动检查并初始化数据库表
if
not
self
.
_ensure_database_ready
():
return
False
if
not
target_date
:
target_date
=
date
.
today
()
...
...
@@ -282,6 +351,10 @@ class MindSpider:
"""运行完整工作流程"""
logger
.
info
(
"开始完整的MindSpider工作流程"
)
# 自动检查并初始化数据库表(确保独立调用时也能自动初始化)
if
not
self
.
_ensure_database_ready
():
return
False
if
not
target_date
:
target_date
=
date
.
today
()
...
...
requirements.txt
View file @
947879f
...
...
@@ -35,9 +35,11 @@ jieba==0.42.1
pymysql==1.1.0
aiomysql==0.2.0
aiosqlite==0.21.0
motor>=3.3.0
redis>=4.6.0
SQLAlchemy==2.0.35
asyncpg==0.29.0
psycopg[binary]>=3.1.0
cryptography==42.0.7
# ===== 爬虫相关 =====
...
...
@@ -67,6 +69,7 @@ xgboost>=2.0.0
# NOTE:如果要安装GPU版本的torch,指令为pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126
# ===== 工具库 =====
typer>=0.9.0
python-dotenv>=1.0.0
python-dateutil>=2.8.2
pytz>=2023.3
...
...
Please
register
or
login
to post a comment