Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
666ghj
2025-11-03 16:36:32 +0800
Browse Files
Options
Browse Files
Download
Plain Diff
Commit
4a49a97d42e7a6f97ca808cfca5b6d520c48f7b5
4a49a97d
2 parents
087f32f6
5b125ea9
Merge branch 'main' of
https://github.com/666ghj/Weibo_PublicOpinion_AnalysisSystem
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
64 additions
and
48 deletions
.gitignore
MindSpider/BroadTopicExtraction/get_today_news.py
MindSpider/DeepSentimentCrawling/MediaCrawler/schema/tables.sql
MindSpider/config.py → MindSpider/config.py.example
README-EN.md
README.md
config.py → config.py.example
requirements.txt
.gitignore
View file @
4a49a97
...
...
@@ -295,6 +295,8 @@ secrets.json
*.key
*.pem
*.crt
config.py
MindSpider/config.py
# API 密钥
api_keys.txt
...
...
MindSpider/BroadTopicExtraction/get_today_news.py
View file @
4a49a97
...
...
@@ -12,6 +12,7 @@ import json
from
datetime
import
datetime
,
date
from
pathlib
import
Path
from
typing
import
List
,
Dict
,
Optional
from
loguru
import
logger
# 添加项目根目录到路径
project_root
=
Path
(
__file__
)
.
parent
.
parent
...
...
@@ -38,8 +39,7 @@ SOURCE_NAMES = {
"wallstreetcn"
:
"华尔街见闻"
,
"thepaper"
:
"澎湃新闻"
,
"cls-hot"
:
"财联社"
,
"xueqiu"
:
"雪球热榜"
,
"kuaishou"
:
"快手热榜"
"xueqiu"
:
"雪球热榜"
}
class
NewsCollector
:
...
...
@@ -72,15 +72,25 @@ class NewsCollector:
async
def
fetch_news
(
self
,
source
:
str
)
->
dict
:
"""从指定源获取最新新闻"""
url
=
f
"{BASE_URL}/api/s?id={source}&latest"
headers
=
{
"Accept"
:
"application/json"
}
headers
=
{
"Accept"
:
"application/json, text/plain, */*"
,
"Accept-Language"
:
"zh-CN,zh;q=0.9,en;q=0.8"
,
"User-Agent"
:
(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0.0.0 Safari/537.36"
),
"Referer"
:
BASE_URL
,
"Connection"
:
"keep-alive"
,
}
try
:
async
with
httpx
.
AsyncClient
(
timeout
=
30.0
)
as
client
:
async
with
httpx
.
AsyncClient
(
timeout
=
30.0
,
follow_redirects
=
True
)
as
client
:
response
=
await
client
.
get
(
url
,
headers
=
headers
)
response
.
raise_for_status
()
# 解析JSON响应
data
=
json
.
loads
(
response
.
text
)
data
=
response
.
json
(
)
return
{
"source"
:
source
,
"status"
:
"success"
,
...
...
@@ -91,21 +101,21 @@ class NewsCollector:
return
{
"source"
:
source
,
"status"
:
"timeout"
,
"error"
:
"请求超时
"
,
"error"
:
f
"请求超时: {source}({url})
"
,
"timestamp"
:
datetime
.
now
()
.
isoformat
()
}
except
httpx
.
HTTPStatusError
as
e
:
return
{
"source"
:
source
,
"status"
:
"http_error"
,
"error"
:
f
"HTTP错误: {e.response.status_code}"
,
"error"
:
f
"HTTP错误: {
source}({url}) - {
e.response.status_code}"
,
"timestamp"
:
datetime
.
now
()
.
isoformat
()
}
except
Exception
as
e
:
return
{
"source"
:
source
,
"status"
:
"error"
,
"error"
:
f
"未知错误: {str(e)}"
,
"error"
:
f
"未知错误: {s
ource}({url}) - {s
tr(e)}"
,
"timestamp"
:
datetime
.
now
()
.
isoformat
()
}
...
...
@@ -114,13 +124,13 @@ class NewsCollector:
if
sources
is
None
:
sources
=
list
(
SOURCE_NAMES
.
keys
())
print
(
f
"正在获取 {len(sources)} 个新闻源的最新内容..."
)
print
(
"="
*
80
)
logger
.
info
(
f
"正在获取 {len(sources)} 个新闻源的最新内容..."
)
logger
.
info
(
"="
*
80
)
results
=
[]
for
source
in
sources
:
source_name
=
SOURCE_NAMES
.
get
(
source
,
source
)
print
(
f
"正在获取 {source_name} 的新闻..."
)
logger
.
info
(
f
"正在获取 {source_name} 的新闻..."
)
result
=
await
self
.
fetch_news
(
source
)
results
.
append
(
result
)
...
...
@@ -128,11 +138,11 @@ class NewsCollector:
data
=
result
[
"data"
]
if
'items'
in
data
and
isinstance
(
data
[
'items'
],
list
):
count
=
len
(
data
[
'items'
])
print
(
f
"✓ {source_name}: 获取成功,共 {count} 条新闻"
)
logger
.
info
(
f
"✓ {source_name}: 获取成功,共 {count} 条新闻"
)
else
:
print
(
f
"✓ {source_name}: 获取成功"
)
logger
.
info
(
f
"✓ {source_name}: 获取成功"
)
else
:
print
(
f
"✗ {source_name}: {result.get('error', '获取失败')}"
)
logger
.
error
(
f
"✗ {source_name}: {result.get('error', '获取失败')}"
)
# 避免请求过快
await
asyncio
.
sleep
(
0.5
)
...
...
@@ -151,18 +161,21 @@ class NewsCollector:
Returns:
包含收集结果的字典
"""
print
(
f
"开始收集每日热点新闻..."
)
print
(
f
"时间: {datetime.now().strftime('
%
Y-
%
m-
%
d
%
H:
%
M:
%
S')}"
)
collection_summary_message
=
""
collection_summary_message
+=
"
\n
开始收集每日热点新闻...
\n
"
collection_summary_message
+=
f
"时间: {datetime.now().strftime('
%
Y-
%
m-
%
d
%
H:
%
M:
%
S')}
\n
"
# 选择新闻源
if
sources
is
None
:
# 使用所有支持的新闻源
sources
=
list
(
SOURCE_NAMES
.
keys
())
print
(
f
"将从 {len(sources)} 个新闻源收集数据:"
)
collection_summary_message
+=
f
"将从 {len(sources)} 个新闻源收集数据:
\n
"
for
source
in
sources
:
source_name
=
SOURCE_NAMES
.
get
(
source
,
source
)
print
(
f
" - {source_name}"
)
collection_summary_message
+=
f
" - {source_name}
\n
"
logger
.
info
(
collection_summary_message
)
try
:
# 获取新闻数据
...
...
@@ -185,7 +198,7 @@ class NewsCollector:
return
processed_data
except
Exception
as
e
:
print
(
f
"收集新闻失败: {e}"
)
logger
.
exception
(
f
"收集新闻失败: {e}"
)
return
{
'success'
:
False
,
'error'
:
str
(
e
),
...
...
@@ -255,35 +268,30 @@ class NewsCollector:
}
except
Exception
as
e
:
print
(
f
"处理新闻项失败: {e}"
)
logger
.
exception
(
f
"处理新闻项失败: {e}"
)
return
None
def
_print_collection_summary
(
self
,
data
:
Dict
):
"""打印收集摘要"""
print
(
"
\n
"
+
"="
*
50
)
print
(
"新闻收集摘要"
)
print
(
"="
*
50
)
print
(
f
"总新闻源: {data['total_sources']}"
)
print
(
f
"成功源数: {data['successful_sources']}"
)
print
(
f
"总新闻数: {data['total_news']}"
)
collection_summary_message
=
""
collection_summary_message
+=
f
"
\n
总新闻源: {data['total_sources']}
\n
"
collection_summary_message
+=
f
"成功源数: {data['successful_sources']}
\n
"
collection_summary_message
+=
f
"总新闻数: {data['total_news']}
\n
"
if
'saved_count'
in
data
:
print
(
f
"已保存数: {data['saved_count']}"
)
print
(
"="
*
50
)
collection_summary_message
+=
f
"已保存数: {data['saved_count']}
\n
"
logger
.
info
(
collection_summary_message
)
def
get_today_news
(
self
)
->
List
[
Dict
]:
"""获取今天的新闻"""
try
:
return
self
.
db_manager
.
get_daily_news
(
date
.
today
())
except
Exception
as
e
:
print
(
f
"获取今日新闻失败: {e}"
)
logger
.
exception
(
f
"获取今日新闻失败: {e}"
)
return
[]
async
def
main
():
"""测试新闻收集器"""
print
(
"测试新闻收集器..."
)
logger
.
info
(
"测试新闻收集器..."
)
async
with
NewsCollector
()
as
collector
:
# 收集新闻
...
...
@@ -292,9 +300,9 @@ async def main():
)
if
result
[
'success'
]:
print
(
f
"收集成功!共获取 {result['total_news']} 条新闻"
)
logger
.
info
(
f
"收集成功!共获取 {result['total_news']} 条新闻"
)
else
:
print
(
f
"收集失败: {result.get('error', '未知错误')}"
)
logger
.
error
(
f
"收集失败: {result.get('error', '未知错误')}"
)
if
__name__
==
"__main__"
:
asyncio
.
run
(
main
())
...
...
MindSpider/DeepSentimentCrawling/MediaCrawler/schema/tables.sql
View file @
4a49a97
...
...
@@ -455,19 +455,12 @@ CREATE TABLE tieba_comment
KEY
`idx_tieba_comment_publish_time`
(
`publish_time`
)
)
ENGINE
=
InnoDB
AUTO_INCREMENT
=
1
DEFAULT
CHARSET
=
utf8mb4
COLLATE
=
utf8mb4_0900_ai_ci
COMMENT
=
'贴吧评论表'
;
-- 增加搜索来源关键字字段
alter
table
bilibili_video
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
douyin_aweme
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
kuaishou_video
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
weibo_note
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
xhs_note
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
tieba_note
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
bilibili_video
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
douyin_aweme
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
kuaishou_video
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
weibo_note
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
xhs_note
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
alter
table
tieba_note
add
column
`source_keyword`
varchar
(
255
)
default
''
comment
'搜索来源关键字'
;
DROP
TABLE
IF
EXISTS
`weibo_creator`
;
...
...
MindSpider/config.py → MindSpider/config.py
.example
View file @
4a49a97
README-EN.md
View file @
4a49a97
...
...
@@ -218,6 +218,8 @@ playwright install chromium
#### 4.1 Configure API Keys
Copy the
`config.py.example` file to `config.py`
Edit the
`config.py`
file and fill in your API keys (you can also choose your own models and search proxies; see the config file for details):
```
python
...
...
@@ -243,6 +245,9 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview"
#### 4.2 Database Initialization
**Option 1: Use Local Database**
You can refer to
`MindSpider\config.py.example` for the configuration template, and you can copy this file and rename it to `config.py`
.
```
bash
# Local MySQL database initialization
cd
MindSpider
...
...
README.md
View file @
4a49a97
...
...
@@ -21,6 +21,9 @@
</div>
> [!IMPORTANT]
> 周一(11.3)会上**在线一键部署体验**,欢迎持续关注!
## ⚡ 项目概述
“
**微舆**
” 是一个从0实现的创新型 多智能体 舆情分析系统,帮助大家破除信息茧房,还原舆情原貌,预测未来走向,辅助决策。用户只需像聊天一样提出分析需求,智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。
...
...
@@ -220,6 +223,8 @@ playwright install chromium
#### 4.1 配置API密钥
复制一份
`config.py.example` 文件,命名为 `config.py`
编辑
`config.py`
文件,填入您的API密钥(您也可以选择自己的模型、搜索代理,详情见config文件内):
```
python
...
...
@@ -248,6 +253,8 @@ INSIGHT_ENGINE_MODEL_NAME = "kimi-k2-0711-preview"
> MindSpider爬虫系统跟舆情系统是各自独立的,所以需要再去`MindSpider\config.py`配置一下
配置模板可以参考
`MindSpider\config.py.example`,可以复制该文件并命名为`config.py`
```
bash
# 本地MySQL数据库初始化
cd
MindSpider
...
...
config.py → config.py
.example
View file @
4a49a97
requirements.txt
View file @
4a49a97
...
...
@@ -73,3 +73,4 @@ flake8>=6.0.0
# ===== Web服务器 =====
fastapi==0.110.2
uvicorn==0.29.0
loguru
\ No newline at end of file
...
...
Please
register
or
login
to post a comment