_store_impl.py
5.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : persist1@126.com
# @Time : 2025/9/5 19:34
# @Desc : 贴吧存储实现类
import asyncio
import csv
import json
import os
import pathlib
from typing import Dict
import aiofiles
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
import config
from base.base_crawler import AbstractStore
from database.models import TiebaNote, TiebaComment, TiebaCreator
from tools import utils, words
from database.db_session import get_session
from var import crawler_type_var
from tools.async_file_writer import AsyncFileWriter
def calculate_number_of_files(file_store_path: str) -> int:
    """Return the next numeric file prefix to use inside ``file_store_path``.

    Saved files are expected to be named ``<number>_<rest>``; the leading
    number lets each run write to a new file instead of reusing an old one.

    Args:
        file_store_path: Directory that holds the saved data files.

    Returns:
        1 when the directory does not exist or contains no entry with a
        numeric prefix, otherwise the largest existing prefix plus one.
    """
    if not os.path.exists(file_store_path):
        return 1

    prefixes = []
    for file_name in os.listdir(file_store_path):
        try:
            prefixes.append(int(file_name.split("_")[0]))
        except ValueError:
            # Unrelated entries (".DS_Store", "notes.txt", ...) must not reset
            # the counter to 1, so skip them instead of aborting the scan.
            continue
    return max(prefixes, default=0) + 1
class TieBaCsvStoreImplement(AbstractStore):
    """Tieba store that forwards every item to ``AsyncFileWriter.write_to_csv``."""

    def __init__(self, **kwargs):
        """Pass ``kwargs`` to the base class and create the tieba file writer."""
        super().__init__(**kwargs)
        current_crawler_type = crawler_type_var.get()
        self.writer = AsyncFileWriter(platform="tieba", crawler_type=current_crawler_type)

    async def _append_csv(self, item_type: str, item: Dict) -> None:
        """Hand ``item`` to the writer's CSV method under ``item_type``."""
        await self.writer.write_to_csv(item_type=item_type, item=item)

    async def store_content(self, content_item: Dict):
        """
        Store a tieba note as CSV.
        Args:
            content_item: note item dict
        Returns:
        """
        await self._append_csv("contents", content_item)

    async def store_comment(self, comment_item: Dict):
        """
        Store a tieba comment as CSV.
        Args:
            comment_item: comment item dict
        Returns:
        """
        await self._append_csv("comments", comment_item)

    async def store_creator(self, creator: Dict):
        """
        Store a tieba creator as CSV.
        Args:
            creator: creator dict
        Returns:
        """
        await self._append_csv("creators", creator)
class TieBaDbStoreImplement(AbstractStore):
    """Tieba store that writes items to the database via ``get_session()``."""

    async def _save_or_update(self, model, key_name: str, item: Dict) -> None:
        """Update the row matching ``item[key_name]`` or insert a new one.

        Args:
            model: ORM class to query and, if no row matches, instantiate.
            key_name: Name shared by the identifying dict key and model column.
            item: Field values; every key is copied onto the stored row.
        """
        key_value = item.get(key_name)
        async with get_session() as session:
            query = select(model).where(getattr(model, key_name) == key_value)
            result = await session.execute(query)
            record = result.scalar_one_or_none()
            if not record:
                # Nothing stored under this key yet: create the row.
                record = model(**item)
                session.add(record)
            else:
                # Existing row: overwrite each field present in the item.
                for field, value in item.items():
                    setattr(record, field, value)
            await session.commit()

    async def store_content(self, content_item: Dict):
        """
        Store a tieba note in the database, keyed by ``note_id``.
        Args:
            content_item: content item dict
        """
        await self._save_or_update(TiebaNote, "note_id", content_item)

    async def store_comment(self, comment_item: Dict):
        """
        Store a tieba comment in the database, keyed by ``comment_id``.
        Args:
            comment_item: comment item dict
        """
        await self._save_or_update(TiebaComment, "comment_id", comment_item)

    async def store_creator(self, creator: Dict):
        """
        Store a tieba creator in the database, keyed by ``user_id``.
        Args:
            creator: creator dict
        """
        await self._save_or_update(TiebaCreator, "user_id", creator)
class TieBaJsonStoreImplement(AbstractStore):
    """Tieba store that forwards every item to ``AsyncFileWriter.write_single_item_to_json``."""

    def __init__(self, **kwargs):
        """Pass ``kwargs`` to the base class and create the tieba file writer."""
        super().__init__(**kwargs)
        current_crawler_type = crawler_type_var.get()
        self.writer = AsyncFileWriter(platform="tieba", crawler_type=current_crawler_type)

    async def _append_json(self, item_type: str, item: Dict) -> None:
        """Hand ``item`` to the writer's single-item JSON method under ``item_type``."""
        await self.writer.write_single_item_to_json(item_type=item_type, item=item)

    async def store_content(self, content_item: Dict):
        """
        Store a tieba note as JSON.
        Args:
            content_item: note item dict
        Returns:
        """
        await self._append_json("contents", content_item)

    async def store_comment(self, comment_item: Dict):
        """
        Store a tieba comment as JSON.
        Args:
            comment_item: comment item dict
        Returns:
        """
        await self._append_json("comments", comment_item)

    async def store_creator(self, creator: Dict):
        """
        Store a tieba creator as JSON.
        Args:
            creator: creator dict
        Returns:
        """
        await self._append_json("creators", creator)
class TieBaSqliteStoreImplement(TieBaDbStoreImplement):
    """Tieba SQLite store; adds nothing to ``TieBaDbStoreImplement``."""