cache_manager.py
11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
import json
import os
import time
import shutil
from datetime import datetime, timedelta
import threading
import queue
from collections import OrderedDict
import pickle
import hashlib
import logging
logger = logging.getLogger('cache_manager')
logger.setLevel(logging.INFO)
class LRUCache:
"""实现LRU (Least Recently Used) 缓存策略"""
def __init__(self, capacity):
self.cache = OrderedDict()
self.capacity = capacity
def get(self, key):
if key not in self.cache:
return None
# 访问元素时,将其移至末尾,表示最近使用
self.cache.move_to_end(key)
return self.cache[key]
def put(self, key, value):
# 如果键已存在,更新值并将其移至末尾
if key in self.cache:
self.cache[key] = value
self.cache.move_to_end(key)
return
# 如果缓存已满,删除最久未使用的项(OrderedDict 的首项)
if len(self.cache) >= self.capacity:
self.cache.popitem(last=False)
# 添加新项至末尾
self.cache[key] = value
def remove(self, key):
if key in self.cache:
del self.cache[key]
def clear(self):
self.cache.clear()
def __len__(self):
return len(self.cache)
def get_all_keys(self):
return list(self.cache.keys())
class CacheManager:
"""两级缓存系统:内存LRU缓存 + 磁盘持久化缓存"""
_instance = None
_lock = threading.Lock()
def __new__(cls, *args, **kwargs):
with cls._lock:
if cls._instance is None:
cls._instance = super(CacheManager, cls).__new__(cls)
return cls._instance
def __init__(self, name="default", memory_capacity=1000, cache_duration=24,
disk_cache_dir="cache", flush_interval=5):
if hasattr(self, 'initialized'):
return
self.name = name
self.memory_cache = LRUCache(memory_capacity)
self.disk_cache_dir = os.path.join(disk_cache_dir, name)
self.cache_duration = timedelta(hours=cache_duration)
self.flush_interval = flush_interval # 定时将内存缓存刷新到磁盘的间隔(分钟)
self.cache_stats = {"hits": 0, "misses": 0, "disk_hits": 0}
self.disk_queue = queue.Queue()
self.initialized = True
# 确保缓存目录存在
os.makedirs(self.disk_cache_dir, exist_ok=True)
# 启动缓存管理线程
self.cleanup_thread = threading.Thread(target=self._cleanup_and_flush_task, daemon=True)
self.cleanup_thread.start()
# 启动磁盘写入线程
self.disk_writer_thread = threading.Thread(target=self._disk_writer_task, daemon=True)
self.disk_writer_thread.start()
logger.info(f"初始化缓存管理器: {name},内存容量: {memory_capacity}项,缓存时间: {cache_duration}小时")
def _get_cache_key(self, key):
"""标准化缓存键"""
if isinstance(key, str):
return key
return hashlib.md5(str(key).encode()).hexdigest()
def _get_disk_path(self, key):
"""获取磁盘缓存路径"""
safe_key = self._get_cache_key(key)
return os.path.join(self.disk_cache_dir, f"{safe_key}.cache")
def _is_cache_valid(self, timestamp):
"""检查缓存是否过期"""
cache_time = datetime.fromtimestamp(timestamp)
return datetime.now() - cache_time < self.cache_duration
def get(self, key):
"""获取缓存数据,首先检查内存,然后检查磁盘"""
cache_key = self._get_cache_key(key)
# 1. 检查内存缓存
cache_data = self.memory_cache.get(cache_key)
if cache_data is not None:
if self._is_cache_valid(cache_data['timestamp']):
self.cache_stats["hits"] += 1
logger.debug(f"内存缓存命中: {key}")
return cache_data['data']
else:
# 过期缓存,从内存中删除
self.memory_cache.remove(cache_key)
# 2. 检查磁盘缓存
disk_path = self._get_disk_path(cache_key)
if os.path.exists(disk_path):
try:
with open(disk_path, 'rb') as f:
cache_data = pickle.load(f)
if self._is_cache_valid(cache_data['timestamp']):
# 从磁盘加载后,放入内存缓存
self.memory_cache.put(cache_key, cache_data)
self.cache_stats["disk_hits"] += 1
logger.debug(f"磁盘缓存命中: {key}")
return cache_data['data']
else:
# 过期缓存,删除磁盘文件
os.remove(disk_path)
except Exception as e:
logger.warning(f"读取磁盘缓存失败: {key}, 错误: {e}")
self.cache_stats["misses"] += 1
logger.debug(f"缓存未命中: {key}")
return None
def set(self, key, data, immediate_disk_write=False):
"""设置缓存数据,同时更新内存和安排磁盘写入"""
cache_key = self._get_cache_key(key)
cache_data = {
'data': data,
'timestamp': datetime.now().timestamp()
}
# 更新内存缓存
self.memory_cache.put(cache_key, cache_data)
# 安排写入磁盘
if immediate_disk_write:
self._write_to_disk(cache_key, cache_data)
else:
self.disk_queue.put((cache_key, cache_data))
logger.debug(f"缓存已设置: {key}")
return True
def invalidate(self, key):
"""使指定键的缓存失效"""
cache_key = self._get_cache_key(key)
# 从内存中删除
self.memory_cache.remove(cache_key)
# 从磁盘中删除
disk_path = self._get_disk_path(cache_key)
if os.path.exists(disk_path):
try:
os.remove(disk_path)
logger.debug(f"缓存已失效: {key}")
except Exception as e:
logger.warning(f"删除磁盘缓存失败: {key}, 错误: {e}")
return True
def clear_all(self):
"""清除所有缓存"""
# 清除内存缓存
self.memory_cache.clear()
# 清除磁盘缓存
try:
shutil.rmtree(self.disk_cache_dir)
os.makedirs(self.disk_cache_dir, exist_ok=True)
logger.info(f"所有缓存已清除: {self.name}")
except Exception as e:
logger.error(f"清除磁盘缓存失败: {e}")
# 重置统计信息
self.cache_stats = {"hits": 0, "misses": 0, "disk_hits": 0}
return True
def get_stats(self):
"""获取缓存统计信息"""
total_requests = self.cache_stats["hits"] + self.cache_stats["misses"]
hit_rate = (self.cache_stats["hits"] / total_requests * 100) if total_requests > 0 else 0
total_hits = self.cache_stats["hits"] + self.cache_stats["disk_hits"]
memory_size = len(self.memory_cache)
disk_size = len([f for f in os.listdir(self.disk_cache_dir) if f.endswith('.cache')])
return {
"name": self.name,
"memory_items": memory_size,
"disk_items": disk_size,
"memory_hits": self.cache_stats["hits"],
"disk_hits": self.cache_stats["disk_hits"],
"misses": self.cache_stats["misses"],
"total_requests": total_requests,
"hit_rate": hit_rate,
"two_level_hit_rate": (total_hits / total_requests * 100) if total_requests > 0 else 0
}
def _write_to_disk(self, cache_key, cache_data):
"""将缓存写入磁盘"""
disk_path = self._get_disk_path(cache_key)
try:
with open(disk_path, 'wb') as f:
pickle.dump(cache_data, f)
return True
except Exception as e:
logger.warning(f"写入磁盘缓存失败: {cache_key}, 错误: {e}")
return False
def _disk_writer_task(self):
"""后台线程,负责将缓存写入磁盘"""
while True:
try:
# 尝试从队列获取条目,超时后继续循环
try:
cache_key, cache_data = self.disk_queue.get(timeout=1)
self._write_to_disk(cache_key, cache_data)
self.disk_queue.task_done()
except queue.Empty:
time.sleep(0.1)
except Exception as e:
logger.error(f"磁盘写入线程出错: {e}")
time.sleep(5) # 发生错误时等待一段时间
def _cleanup_and_flush_task(self):
"""后台线程,负责清理过期缓存和定期刷新内存缓存到磁盘"""
while True:
try:
# 1. 清理过期的内存缓存
current_time = datetime.now()
for key in self.memory_cache.get_all_keys():
cache_data = self.memory_cache.get(key)
if not self._is_cache_valid(cache_data['timestamp']):
self.memory_cache.remove(key)
# 2. 清理过期的磁盘缓存
for filename in os.listdir(self.disk_cache_dir):
if filename.endswith('.cache'):
filepath = os.path.join(self.disk_cache_dir, filename)
try:
with open(filepath, 'rb') as f:
cache_data = pickle.load(f)
if not self._is_cache_valid(cache_data['timestamp']):
os.remove(filepath)
except Exception as e:
# 清理损坏的缓存文件
logger.warning(f"读取缓存文件失败,将删除: {filepath}, 错误: {e}")
os.remove(filepath)
# 3. 将内存缓存刷新到磁盘
# 注意:这会重写已经写入磁盘的缓存,但确保内存和磁盘保持同步
for key in self.memory_cache.get_all_keys():
cache_data = self.memory_cache.get(key)
self._write_to_disk(key, cache_data)
# 每小时执行一次清理
time.sleep(3600)
except Exception as e:
logger.error(f"缓存清理线程出错: {e}")
time.sleep(3600) # 发生错误时也等待一段时间
# 创建不同领域的缓存实例
prediction_cache = CacheManager(name="predictions", memory_capacity=500, cache_duration=24)
sentiment_cache = CacheManager(name="sentiment", memory_capacity=1000, cache_duration=12)
topic_cache = CacheManager(name="topics", memory_capacity=200, cache_duration=6)
user_data_cache = CacheManager(name="user_data", memory_capacity=300, cache_duration=48)
# 向后兼容的别名
PredictionCache = CacheManager
# 为保持向后兼容,我们保留原来的prediction_cache
prediction_cache_old = prediction_cache