万朱浩 / Venue-Ops
Authored by 戒酒的李白
2025-03-04 10:45:37 +0800
Commit f85298c02191dcb00a49c5fa6014a298bb9b37ad
1 parent e0719583
Spider System Performance & Stability Enhancement
Showing 2 changed files with 195 additions and 151 deletions
spider/spiderData.py
views/spider_control.py
spider/spiderData.py
@@ -11,6 +11,9 @@ from bs4 import BeautifulSoup
 from datetime import datetime
 from utils.logger import spider_logger as logging
 from utils.db_manager import DatabaseManager
+from cachetools import TTLCache, LRUCache
+from typing import List, Dict, Any
+import pandas as pd

 def spiderData():
     if not os.path.exists(navAddr):
@@ -29,16 +32,69 @@ class SpiderData:
         self.base_url = 'https://s.weibo.com'
         self.db = DatabaseManager()

-    def crawl_topic(self, topic, depth=3, interval=5, max_retries=3, timeout=30):
-        """
-        Crawl Weibo posts for the specified topic
-
-        :param topic: topic to crawl
-        :param depth: crawl depth (number of pages)
-        :param interval: interval between requests (seconds)
-        :param max_retries: maximum number of retries
-        :param timeout: request timeout (seconds)
-        """
+        # Initialize caches
+        self.data_cache = TTLCache(maxsize=1000, ttl=3600)  # 1-hour TTL cache
+        self.html_cache = LRUCache(maxsize=100)  # LRU cache of the 100 most recent pages
+
+        # Buffer for batched inserts
+        self.insert_buffer = []
+        self.buffer_size = 50  # bulk-insert every 50 records
+
+    def _get_cached_page(self, url: str) -> str:
+        """Get cached page content."""
+        return self.html_cache.get(url)
+
+    def _cache_page(self, url: str, content: str):
+        """Cache page content."""
+        self.html_cache[url] = content
+
+    def _get_cached_data(self, key: str) -> Dict[str, Any]:
+        """Get cached data."""
+        return self.data_cache.get(key)
+
+    def _cache_data(self, key: str, data: Dict[str, Any]):
+        """Cache data."""
+        self.data_cache[key] = data
+
+    def _flush_buffer(self):
+        """Bulk-insert the buffered records into the database."""
+        if not self.insert_buffer:
+            return
+        try:
+            connection = self.db.get_connection()
+            with connection.cursor() as cursor:
+                # Use pandas for an efficient bulk insert
+                df = pd.DataFrame(self.insert_buffer)
+                # Build the bulk-insert SQL
+                columns = ', '.join(df.columns)
+                values = ', '.join(['%s'] * len(df.columns))
+                sql = f"""
+                    INSERT INTO article ({columns})
+                    VALUES ({values})
+                    ON DUPLICATE KEY UPDATE
+                        forward_count = VALUES(forward_count),
+                        comment_count = VALUES(comment_count),
+                        like_count = VALUES(like_count),
+                        crawl_time = VALUES(crawl_time)
+                """
+                # Execute the bulk insert
+                cursor.executemany(sql, df.values.tolist())
+                connection.commit()
+                logging.info(f"Bulk-inserted {len(self.insert_buffer)} records")
+                self.insert_buffer.clear()
+        except Exception as e:
+            logging.error(f"Bulk insert failed: {e}")
+            if connection:
+                connection.rollback()
+
+    def crawl_topic(self, topic: str, depth: int = 3, interval: int = 5,
+                    max_retries: int = 3, timeout: int = 30):
+        """Crawl Weibo posts for the specified topic."""
+        # Validate parameters
+        if not isinstance(depth, int) or depth < 1 or depth > 10:
+            raise ValueError("Crawl depth must be between 1 and 10 pages")
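For readers unfamiliar with cachetools: both cache types added to `__init__` behave like dictionaries with an eviction policy, which is what keeps the helper methods above to one line each. A minimal standalone sketch of the eviction behavior (keys and values here are illustrative, not from the commit):

import time
from cachetools import TTLCache, LRUCache

html_cache = LRUCache(maxsize=2)             # evicts the least recently used entry
data_cache = TTLCache(maxsize=100, ttl=1)    # entries expire after ttl seconds

html_cache['a'] = '<html>A</html>'
html_cache['b'] = '<html>B</html>'
_ = html_cache['a']                          # touch 'a'; 'b' is now least recently used
html_cache['c'] = '<html>C</html>'           # inserting 'c' evicts 'b'
print(html_cache.get('b'))                   # None -> treat as a miss and re-fetch

data_cache['k'] = {'likes': 3}
time.sleep(1.1)
print(data_cache.get('k'))                   # None -> the entry has expired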
@@ -56,9 +112,19 @@ class SpiderData:
         while retries < max_retries:
             try:
                 url = f"{self.base_url}/weibo?q={topic}&page={page}"
+                # Check the cache first
+                cached_content = self._get_cached_page(url)
+                if cached_content:
+                    self._parse_page(cached_content)
+                    logging.info(f"Using cached data: {topic}, page {page}")
+                    break
+
                 response = requests.get(url, headers=self.headers, timeout=timeout)
                 if response.status_code == 200:
+                    # Cache the page content
+                    self._cache_page(url, response.text)
                     self._parse_page(response.text)
                     logging.info(f"Crawled topic {topic}, page {page}")
                     break
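The added lines implement a cache-aside read: consult the LRU cache, fall back to the network on a miss, then populate the cache for next time. Reduced to its essentials, the pattern looks like this (fetch_page is an illustrative name, not a function from the diff):

import requests
from cachetools import LRUCache

html_cache = LRUCache(maxsize=100)

def fetch_page(url: str, timeout: int = 30) -> str:
    # Cache-aside: serve the cached copy when present...
    cached = html_cache.get(url)
    if cached is not None:
        return cached
    # ...otherwise fetch, then populate the cache for the next caller
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    html_cache[url] = response.text
    return response.text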
@@ -85,12 +151,11 @@ class SpiderData:
             logging.info(f"Waiting {sleep_time:.2f} seconds before continuing...")
             time.sleep(sleep_time)

-    def _parse_page(self, html_content):
-        """
-        Parse the page content and save the data
-
-        :param html_content: HTML content of the page
-        """
+        # Flush the buffer one final time
+        self._flush_buffer()
+
+    def _parse_page(self, html_content: str):
+        """Parse the page content and save the data."""
         try:
             soup = BeautifulSoup(html_content, 'html.parser')
             weibo_items = soup.find_all('div', class_='card-wrap')
@@ -124,8 +189,12 @@ class SpiderData:
                     'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                 }
-                # Save to the database
-                self._save_to_database(weibo_data)
+                # Append to the insert buffer
+                self.insert_buffer.append(weibo_data)
+                # Bulk-insert once the buffer reaches the threshold
+                if len(self.insert_buffer) >= self.buffer_size:
+                    self._flush_buffer()
             except Exception as e:
                 logging.error(f"Error while parsing a Weibo item: {e}")
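The buffering above trades one database round-trip per record for a single executemany call per batch of 50. The same idea in a self-contained sketch (sqlite3 stands in for the project's MySQL connection, and the table schema is illustrative):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE article (content TEXT, like_count INTEGER)')

insert_buffer = []
BUFFER_SIZE = 3  # the commit uses 50

def flush_buffer():
    # One round-trip for the whole batch instead of one per record
    if not insert_buffer:
        return
    conn.executemany('INSERT INTO article (content, like_count) VALUES (?, ?)',
                     insert_buffer)
    conn.commit()
    insert_buffer.clear()

for i in range(7):
    insert_buffer.append((f'post {i}', i))
    if len(insert_buffer) >= BUFFER_SIZE:
        flush_buffer()
flush_buffer()  # final flush, as crawl_topic now does after the page loop

print(conn.execute('SELECT COUNT(*) FROM article').fetchone()[0])  # 7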
@@ -134,52 +203,12 @@ class SpiderData:
         except Exception as e:
             logging.error(f"Error while parsing the page: {e}")

-    def _extract_number(self, text):
-        """
-        Extract a number from text
-
-        :param text: text containing a number
-        :return: the extracted number, or 0 if none is found
-        """
+    def _extract_number(self, text: str) -> int:
+        """Extract a number from text, returning 0 if none is found."""
         try:
             return int(''.join(filter(str.isdigit, text)))
         except ValueError:
             return 0

-    def _save_to_database(self, data):
-        """
-        Save data to the database
-
-        :param data: dictionary of data to save
-        """
-        connection = None
-        try:
-            connection = self.db.get_connection()
-            with connection.cursor() as cursor:
-                # Insert the article data
-                sql = """
-                    INSERT INTO article (content, user_name, publish_time, forward_count,
-                                         comment_count, like_count, crawl_time)
-                    VALUES (%s, %s, %s, %s, %s, %s, %s)
-                """
-                cursor.execute(sql, (
-                    data['content'], data['user_name'], data['publish_time'],
-                    data['forward_count'], data['comment_count'],
-                    data['like_count'], data['crawl_time']
-                ))
-                connection.commit()
-                logging.info(f"Saved Weibo data: {data['content'][:30]}...")
-        except Exception as e:
-            logging.error(f"Error while saving data: {e}")
-            if connection:
-                connection.rollback()

 if __name__ == '__main__':
     spiderData()
\ No newline at end of file
views/spider_control.py
@@ -10,6 +10,10 @@ import logging
 from spider.spiderData import SpiderData
 from openai import OpenAI
 from anthropic import Anthropic
+import aiohttp
+from concurrent.futures import ThreadPoolExecutor
+from ratelimit import limits, sleep_and_retry
+from tenacity import retry, stop_after_attempt, wait_exponential

 # Create the blueprint
 spider_bp = Blueprint('spider', __name__)
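The ratelimit and tenacity imports are put to use further down in this diff. For orientation, here is a minimal sketch of how the two decorators compose on a plain function (fetch and flaky_fetch are illustrative names):

from ratelimit import limits, sleep_and_retry
from tenacity import retry, stop_after_attempt, wait_exponential

@sleep_and_retry                  # sleep until the period resets instead of raising
@limits(calls=100, period=60)     # at most 100 calls per 60 seconds
def fetch(url: str) -> str:
    return f'fetched {url}'       # placeholder for a real HTTP request

@retry(stop=stop_after_attempt(3),
       wait=wait_exponential(multiplier=1, min=4, max=10))
def flaky_fetch(url: str) -> str:
    # retried up to 3 times, backing off 4s then 8s, capped at 10s
    return fetch(url)

print(flaky_fetch('https://example.com'))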
@@ -24,124 +28,137 @@ websocket_connections = set()
 # Create the message queue
 message_queue = Queue()

+# Create the thread pool
+thread_pool = ThreadPoolExecutor(max_workers=3)
+
 # Create the async event loop
 loop = asyncio.new_event_loop()
 asyncio.set_event_loop(loop)

 # Default configuration
 DEFAULT_CONFIG = {
     'crawlDepth': 3,
     'interval': 5,
     'maxRetries': 3,
-    'timeout': 30
+    'timeout': 30,
+    'maxConcurrent': 2
 }

 def load_config():
     """Load the spider configuration."""
     config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
     try:
         if os.path.exists(config_path):
             with open(config_path, 'r', encoding='utf-8') as f:
                 return json.load(f)
     except Exception as e:
         logger.error(f"Failed to load the config file: {e}")
     return DEFAULT_CONFIG

 def save_config(config):
     """Save the spider configuration."""
     config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
     try:
         with open(config_path, 'w', encoding='utf-8') as f:
             json.dump(config, f, ensure_ascii=False, indent=4)
         return True
     except Exception as e:
         logger.error(f"Failed to save the config file: {e}")
         return False

-async def broadcast_message(message):
-    """Broadcast a message to all WebSocket connections."""
-    if not websocket_connections:
-        return
-    for websocket in websocket_connections.copy():
-        try:
-            await websocket.send(json.dumps(message))
-        except websockets.exceptions.ConnectionClosed:
-            websocket_connections.remove(websocket)
-        except Exception as e:
-            logger.error(f"Failed to send WebSocket message: {e}")
-            websocket_connections.remove(websocket)
+# Rate-limiting decorator
+@sleep_and_retry
+@limits(calls=100, period=60)  # at most 100 requests per minute
+def rate_limited_request():
+    pass
+
+class SpiderWorker:
+    def __init__(self, topics, parameters):
+        self.topics = topics
+        self.parameters = parameters
+        self.total_topics = len(topics)
+        self.completed_topics = 0
+        self.spider = SpiderData()
+        self.message_buffer = []
+        self.message_buffer_size = 10
+        self.semaphore = asyncio.Semaphore(
+            parameters.get('maxConcurrent', DEFAULT_CONFIG['maxConcurrent']))
+
+    async def send_message(self, message):
+        """Send a message asynchronously, buffered for efficiency."""
+        self.message_buffer.append(message)
+        if len(self.message_buffer) >= self.message_buffer_size:
+            await self.flush_messages()
+
+    async def flush_messages(self):
+        """Flush the message buffer."""
+        if not self.message_buffer:
+            return
+        await broadcast_message(self.message_buffer)
+        self.message_buffer.clear()

-def spider_worker(topics, parameters):
-    """Spider worker thread."""
-    total_topics = len(topics)
-    completed_topics = 0
-
-    async def send_message(message):
-        """Wrapper for sending a message asynchronously."""
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        try:
-            await broadcast_message(message)
-        except Exception as e:
-            logger.error(f"Failed to send message: {e}")
-        finally:
-            loop.close()
-
-    spider = SpiderData()
-
-    for topic in topics:
-        try:
-            # Update progress
-            progress = int((completed_topics / total_topics) * 100)
-            asyncio.run(send_message({'type': 'progress', 'value': progress}))
-
-            # Log the start of the crawl
-            asyncio.run(send_message({
-                'type': 'log',
-                'message': f'Starting to crawl topic: {topic}'
-            }))
-
-            # Run the crawl
-            spider.crawl_topic(
-                topic=topic,
-                depth=parameters['crawlDepth'],
-                interval=parameters['interval'],
-                max_retries=parameters['maxRetries'],
-                timeout=parameters['timeout']
-            )
-
-            completed_topics += 1
-
-            # Log completion of the crawl
-            asyncio.run(send_message({
-                'type': 'log',
-                'message': f'Finished crawling topic {topic}'
-            }))
-        except Exception as e:
-            # Log the error
-            asyncio.run(send_message({
-                'type': 'log',
-                'message': f'Error while crawling topic {topic}: {str(e)}'
-            }))
-
-    # Update the final progress
-    asyncio.run(send_message({'type': 'progress', 'value': 100}))
-
-    # Send the completion message
-    asyncio.run(send_message({
-        'type': 'log',
-        'message': 'All topics crawled'
-    }))
+    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+    async def crawl_single_topic(self, topic):
+        """Crawl a single topic."""
+        try:
+            rate_limited_request()
+            await self.send_message({
+                'type': 'log',
+                'message': f'Starting to crawl topic: {topic}'
+            })
+            async with self.semaphore:
+                await asyncio.get_event_loop().run_in_executor(
+                    thread_pool,
+                    self.spider.crawl_topic,
+                    topic,
+                    self.parameters['crawlDepth'],
+                    self.parameters['interval'],
+                    self.parameters['maxRetries'],
+                    self.parameters['timeout']
+                )
+            self.completed_topics += 1
+            progress = int((self.completed_topics / self.total_topics) * 100)
+            await self.send_message({'type': 'progress', 'value': progress})
+            await self.send_message({
+                'type': 'log',
+                'message': f'Finished crawling topic {topic}'
+            })
+        except Exception as e:
+            logger.error(f"Failed to crawl topic {topic}: {e}")
+            await self.send_message({
+                'type': 'log',
+                'message': f'Error while crawling topic {topic}: {str(e)}'
+            })
+            raise
+
+    async def run(self):
+        """Run the crawl job."""
+        try:
+            tasks = [self.crawl_single_topic(topic) for topic in self.topics]
+            await asyncio.gather(*tasks)
+            await self.flush_messages()
+            await self.send_message({
+                'type': 'log',
+                'message': 'All topics crawled'
+            })
+        except Exception as e:
+            logger.error(f"Crawl job failed: {e}")
+            await self.send_message({
+                'type': 'log',
+                'message': f'Crawl job failed: {str(e)}'
+            })
+        finally:
+            await self.flush_messages()
+
+async def broadcast_message(messages):
+    """Broadcast a message, or a list of messages, to all WebSocket connections."""
+    if not websocket_connections:
+        return
+    for websocket in websocket_connections.copy():
+        try:
+            if isinstance(messages, list):
+                for message in messages:
+                    await websocket.send(json.dumps(message))
+            else:
+                await websocket.send(json.dumps(messages))
+        except websockets.exceptions.ConnectionClosed:
+            websocket_connections.remove(websocket)
+        except Exception as e:
+            logger.error(f"Failed to send WebSocket message: {e}")
+            websocket_connections.remove(websocket)

 @spider_bp.route('/spider/control')
 def spider_control():
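SpiderWorker's core trick is gating blocking crawl calls behind an asyncio.Semaphore and handing them to a thread pool with run_in_executor, so at most maxConcurrent crawls run at once without blocking the event loop. A self-contained sketch of that pattern (blocking_fetch is an illustrative stand-in for SpiderData.crawl_topic):

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

thread_pool = ThreadPoolExecutor(max_workers=3)

def blocking_fetch(topic: str) -> str:
    time.sleep(0.1)  # stands in for blocking I/O such as requests.get
    return f'data for {topic}'

async def crawl(topic: str, semaphore: asyncio.Semaphore) -> str:
    async with semaphore:  # at most maxConcurrent crawls at a time
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(thread_pool, blocking_fetch, topic)

async def main():
    semaphore = asyncio.Semaphore(2)  # mirrors DEFAULT_CONFIG['maxConcurrent']
    results = await asyncio.gather(*(crawl(t, semaphore) for t in ['a', 'b', 'c']))
    print(results)

asyncio.run(main())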
@@ -149,12 +166,12 @@ def spider_control():
     return render_template('spider_control.html')

 @spider_bp.route('/api/spider/start', methods=['POST'])
-def start_spider():
+async def start_spider():
     """Start a crawl job."""
     try:
         data = request.get_json()
         topics = data.get('topics', [])
-        parameters = data.get('parameters', DEFAULT_CONFIG)
+        parameters = {**DEFAULT_CONFIG, **data.get('parameters', {})}

         if not topics:
             return jsonify({
@@ -162,13 +179,11 @@ def start_spider():
             'message': 'Please select at least one topic'
         })

-        # Start the spider thread
-        thread = threading.Thread(
-            target=spider_worker,
-            args=(topics, parameters),
-            daemon=True
-        )
-        thread.start()
+        # Create the spider worker
+        worker = SpiderWorker(topics, parameters)
+        # Run the crawl job on the event loop
+        asyncio.create_task(worker.run())

         return jsonify({
             'success': True,
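Worth noting: asyncio.create_task only works when an event loop is already running, which for a Flask view means an async-capable setup (Flask 2.0+ installed with the async extra). An alternative that works from a plain sync view is to run the module-level loop on a daemon thread and hand coroutines to it thread-safely; a sketch under that assumption (job is an illustrative stand-in for worker.run()):

import asyncio
import threading

loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

async def job():
    await asyncio.sleep(0.1)  # stands in for worker.run()
    return 'done'

# Safe to call from a non-async context such as a sync Flask view
future = asyncio.run_coroutine_threadsafe(job(), loop)
print(future.result())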