YYL469

优化爬虫代码

... ... @@ -36,7 +36,7 @@ def run_spider_script():
if __name__ == '__main__':
scheduler = BackgroundScheduler(timezone=utc)
scheduler.add_job(run_spider_script, 'interval', hours=5)
scheduler.add_job(run_spider_script, 'interval', minutes=1)
scheduler.start()
try:
... ...
from spiderContent import start as spiderContent
from spiderComments import start as spiderComments
from spiderData import spiderData
from saveData import save_to_sql as saveData
def main():
    """Run the full scraping pipeline end to end.

    Order: article content -> article comments -> derived data -> save to SQL.
    Each stage prints a progress message (Chinese, user-facing) before it runs.
    """
    # Table-driven dispatch: (progress message, stage callable), executed in order.
    stages = [
        ('正在爬取文章数据', lambda: spiderContent(1, 1)),
        ('正在爬取文章评论数据', spiderComments),
        ('正在爬取数据', spiderData),
        ('正在存储数据', saveData),
    ]
    for message, run_stage in stages:
        print(message)
        run_stage()
    print("爬取数据更新")
... ...
from spiderDataPackage.spiderNav import start as spiderNav
from spiderDataPackage.spiderContent import start as spiderContent
from spiderDataPackage.spiderComments import start as spiderComments
import os
def spiderData():
    """Crawl navigation data (only if nav.csv is absent), then articles and comments.

    The navigation CSV acts as a one-time cache: once it exists, the nav
    crawl is skipped on subsequent runs.
    """
    # NOTE(review): path is hard-coded here while sibling modules read
    # navAddr from settings — consider unifying; verify before changing.
    if not os.path.exists('./nav.csv'):
        print('正在爬取导航栏数据')
        spiderNav()
    # Remaining stages always run; each announces itself first.
    for message, crawl in (
        ('正在爬取文章数据', lambda: spiderContent(1, 1)),
        ('正在爬取文章评论数据', spiderComments),
    ):
        print(message)
        crawl()
# Allow this module to be run directly as a script to kick off the crawl.
if __name__ == '__main__':
    spiderData()
\ No newline at end of file
... ...
# Shared output locations for the scraped CSV files; other spider modules
# import these instead of hard-coding paths (relative to the working dir).
navAddr="./nav.csv"
articleAddr="./article.csv"
commentsAddr="./comments.csv"
\ No newline at end of file
... ...
... ... @@ -3,10 +3,11 @@ import requests
import csv
import os
from datetime import datetime
from settings import articleAddr,commentsAddr
def init():
if not os.path.exists('./comments.csv'):
with open('./comments.csv','w',encoding='utf-8',newline='') as csvFile:
if not os.path.exists(commentsAddr):
with open(commentsAddr,'w',encoding='utf-8',newline='') as csvFile:
writer = csv.writer(csvFile)
writer.writerow([
'articleId',
... ... @@ -21,7 +22,7 @@ def init():
])
def write(row):
with open('./comments.csv', 'a', encoding='utf-8', newline='') as csvFile:
with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile:
writer = csv.writer(csvFile)
writer.writerow(row)
... ... @@ -38,7 +39,7 @@ def fetchData(url,params):
def getArticleList():
articleList = []
with open('./article.csv','r',encoding='utf-8') as reader:
with open(articleAddr,'r',encoding='utf-8') as reader:
readerCsv = csv.reader(reader)
next(reader)
for nav in readerCsv:
... ...
... ... @@ -3,10 +3,11 @@ import requests
import csv
import os
from datetime import datetime
from settings import navAddr,articleAddr
def init():
if not os.path.exists('./article.csv'):
with open('./article.csv','w',encoding='utf-8',newline='') as csvFile:
if not os.path.exists(articleAddr):
with open(articleAddr,'w',encoding='utf-8',newline='') as csvFile:
writer = csv.writer(csvFile)
writer.writerow([
'id',
... ... @@ -26,7 +27,7 @@ def init():
])
def write(row):
with open('./article.csv', 'a', encoding='utf-8', newline='') as csvFile:
with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile:
writer = csv.writer(csvFile)
writer.writerow(row)
... ... @@ -43,7 +44,7 @@ def fetchData(url,params):
def getTypeList():
typeList = []
with open('./nav.csv','r',encoding='utf-8') as reader:
with open(navAddr,'r',encoding='utf-8') as reader:
readerCsv = csv.reader(reader)
next(reader)
for nav in readerCsv:
... ...
... ... @@ -2,10 +2,10 @@ import requests
import csv
import numpy as np
import os
from settings import navAddr
def init():
if not os.path.exists('./nav.csv'):
with open('./nav.csv','w',encoding='utf-8',newline='') as csvFile:
if not os.path.exists(navAddr):
with open(navAddr,'w',encoding='utf-8',newline='') as csvFile:
writer = csv.writer(csvFile)
writer.writerow([
'typeName',
... ... @@ -14,7 +14,7 @@ def init():
])
def write(row):
with open('./nav.csv', 'a', encoding='utf-8', newline='') as csvFile:
with open(navAddr, 'a', encoding='utf-8', newline='') as csvFile:
writer = csv.writer(csvFile)
writer.writerow(row)
... ... @@ -45,9 +45,11 @@ def readJson(response):
containerid
])
if __name__ == '__main__':
def start():
init()
url = 'https://weibo.com/ajax/feed/allGroups'
response = fetchData(url)
readJson(response)
\ No newline at end of file
readJson(response)
if __name__ == '__main__':
start()
\ No newline at end of file
... ...