YYL469

优化爬虫代码

@@ -36,7 +36,7 @@ def run_spider_script():
36 36
37 if __name__ == '__main__': 37 if __name__ == '__main__':
38 scheduler = BackgroundScheduler(timezone=utc) 38 scheduler = BackgroundScheduler(timezone=utc)
39 - scheduler.add_job(run_spider_script, 'interval', hours=5) 39 + scheduler.add_job(run_spider_script, 'interval', minutes=1)
40 scheduler.start() 40 scheduler.start()
41 41
42 try: 42 try:
1 -from spiderContent import start as spiderContent  
2 -from spiderComments import start as spiderComments 1 +from spiderData import spiderData
3 from saveData import save_to_sql as saveData 2 from saveData import save_to_sql as saveData
4 3
5 def main(): 4 def main():
6 - print('正在爬取文章数据')  
7 - spiderContent(1,1)  
8 - print('正在爬取文章评论数据')  
9 - spiderComments() 5 + print('正在爬取数据')
  6 + spiderData()
10 print('正在存储数据') 7 print('正在存储数据')
11 saveData() 8 saveData()
12 print("爬取数据更新") 9 print("爬取数据更新")
  1 +from spiderDataPackage.spiderNav import start as spiderNav
  2 +from spiderDataPackage.spiderContent import start as spiderContent
  3 +from spiderDataPackage.spiderComments import start as spiderComments
  4 +import os
  5 +
  6 +def spiderData():
  7 + if not os.path.exists('./nav.csv'):
  8 + print('正在爬取导航栏数据')
  9 + spiderNav()
  10 + print('正在爬取文章数据')
  11 + spiderContent(1,1)
  12 + print('正在爬取文章评论数据')
  13 + spiderComments()
  14 +
  15 +if __name__ == '__main__':
  16 + spiderData()
  1 +navAddr="./nav.csv"
  2 +articleAddr="./article.csv"
  3 +commentsAddr="./comments.csv"
@@ -3,10 +3,11 @@ import requests
3 import csv 3 import csv
4 import os 4 import os
5 from datetime import datetime 5 from datetime import datetime
  6 +from settings import articleAddr,commentsAddr
6 7
7 def init(): 8 def init():
8 - if not os.path.exists('./comments.csv'):  
9 - with open('./comments.csv','w',encoding='utf-8',newline='') as csvFile: 9 + if not os.path.exists(commentsAddr):
  10 + with open(commentsAddr,'w',encoding='utf-8',newline='') as csvFile:
10 writer = csv.writer(csvFile) 11 writer = csv.writer(csvFile)
11 writer.writerow([ 12 writer.writerow([
12 'articleId', 13 'articleId',
@@ -21,7 +22,7 @@ def init():
21 ]) 22 ])
22 23
23 def write(row): 24 def write(row):
24 - with open('./comments.csv', 'a', encoding='utf-8', newline='') as csvFile: 25 + with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile:
25 writer = csv.writer(csvFile) 26 writer = csv.writer(csvFile)
26 writer.writerow(row) 27 writer.writerow(row)
27 28
@@ -38,7 +39,7 @@ def fetchData(url,params):
38 39
39 def getArticleList(): 40 def getArticleList():
40 articleList = [] 41 articleList = []
41 - with open('./article.csv','r',encoding='utf-8') as reader: 42 + with open(articleAddr,'r',encoding='utf-8') as reader:
42 readerCsv = csv.reader(reader) 43 readerCsv = csv.reader(reader)
43 next(reader) 44 next(reader)
44 for nav in readerCsv: 45 for nav in readerCsv:
@@ -3,10 +3,11 @@ import requests
3 import csv 3 import csv
4 import os 4 import os
5 from datetime import datetime 5 from datetime import datetime
  6 +from settings import navAddr,articleAddr
6 7
7 def init(): 8 def init():
8 - if not os.path.exists('./article.csv'):  
9 - with open('./article.csv','w',encoding='utf-8',newline='') as csvFile: 9 + if not os.path.exists(articleAddr):
  10 + with open(articleAddr,'w',encoding='utf-8',newline='') as csvFile:
10 writer = csv.writer(csvFile) 11 writer = csv.writer(csvFile)
11 writer.writerow([ 12 writer.writerow([
12 'id', 13 'id',
@@ -26,7 +27,7 @@ def init():
26 ]) 27 ])
27 28
28 def write(row): 29 def write(row):
29 - with open('./article.csv', 'a', encoding='utf-8', newline='') as csvFile: 30 + with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile:
30 writer = csv.writer(csvFile) 31 writer = csv.writer(csvFile)
31 writer.writerow(row) 32 writer.writerow(row)
32 33
@@ -43,7 +44,7 @@ def fetchData(url,params):
43 44
44 def getTypeList(): 45 def getTypeList():
45 typeList = [] 46 typeList = []
46 - with open('./nav.csv','r',encoding='utf-8') as reader: 47 + with open(navAddr,'r',encoding='utf-8') as reader:
47 readerCsv = csv.reader(reader) 48 readerCsv = csv.reader(reader)
48 next(reader) 49 next(reader)
49 for nav in readerCsv: 50 for nav in readerCsv:
@@ -2,10 +2,10 @@ import requests
2 import csv 2 import csv
3 import numpy as np 3 import numpy as np
4 import os 4 import os
5 - 5 +from settings import navAddr
6 def init(): 6 def init():
7 - if not os.path.exists('./nav.csv'):  
8 - with open('./nav.csv','w',encoding='utf-8',newline='') as csvFile: 7 + if not os.path.exists(navAddr):
  8 + with open(navAddr,'w',encoding='utf-8',newline='') as csvFile:
9 writer = csv.writer(csvFile) 9 writer = csv.writer(csvFile)
10 writer.writerow([ 10 writer.writerow([
11 'typeName', 11 'typeName',
@@ -14,7 +14,7 @@ def init():
14 ]) 14 ])
15 15
16 def write(row): 16 def write(row):
17 - with open('./nav.csv', 'a', encoding='utf-8', newline='') as csvFile: 17 + with open(navAddr, 'a', encoding='utf-8', newline='') as csvFile:
18 writer = csv.writer(csvFile) 18 writer = csv.writer(csvFile)
19 writer.writerow(row) 19 writer.writerow(row)
20 20
@@ -45,9 +45,11 @@ def readJson(response):
45 containerid 45 containerid
46 ]) 46 ])
47 47
48 -  
49 -if __name__ == '__main__': 48 +def start():
50 init() 49 init()
51 url = 'https://weibo.com/ajax/feed/allGroups' 50 url = 'https://weibo.com/ajax/feed/allGroups'
52 response = fetchData(url) 51 response = fetchData(url)
53 - readJson(response)  
  52 + readJson(response)
  53 +
  54 +if __name__ == '__main__':
  55 + start()