YYL469

【spiderNav.py】实现导航栏爬取

  1 +import requests
  2 +import csv
  3 +import numpy as np
  4 +import os
  5 +
def init():
    """Create ``./navData.csv`` with its header row if the file is missing.

    An existing file is left untouched, so rows written by earlier runs
    are kept.
    """
    csv_path = './navData.csv'
    if os.path.exists(csv_path):
        return
    header = ['typeName', 'gid', 'containerid']
    # newline='' leaves line-ending handling to the csv module.
    with open(csv_path, 'w', encoding='utf-8', newline='') as out:
        csv.writer(out).writerow(header)
  15 +
def writerRow(row):
    """Append ``row`` as a single record to ``./navData.csv``.

    Args:
        row: Iterable of field values handed to ``csv.writer.writerow``.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open('./navData.csv', mode='a', encoding='utf-8', newline='') as out:
        csv.writer(out).writerow(row)
  20 +
def get_data(url, timeout=10):
    """Fetch ``url`` with the crawler's headers and return the decoded JSON.

    The request is sent with the query parameters ``is_new_segment=1`` and
    ``fetch_hot=1``.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block forever.

    Returns:
        The parsed JSON body when the status code is 200, otherwise ``None``.
    """
    # NOTE(review): the session cookie is hard-coded in source; it should be
    # loaded from configuration or the environment rather than committed.
    headers = {
        'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
    }
    params = {
        'is_new_segment': 1,
        'fetch_hot': 1,
    }
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.json()
  35 +
def parse_json(response):
    """Write one CSV row per navigation entry found in ``response``.

    Entries are read from ``response['groups'][3]['group']`` followed by
    ``response['groups'][4]['group']``. For each entry its ``title``,
    ``gid`` and ``containerid`` values are passed to ``writerRow``.

    Args:
        response: Decoded JSON payload as returned by ``get_data``, or
            ``None`` when the request did not succeed.
    """
    if response is None:
        # get_data returns None on a non-200 status; there is nothing to write.
        return
    groups = response['groups']
    # Plain list concatenation; a NumPy object array is not needed to join
    # two lists of dicts.
    navList = list(groups[3]['group']) + list(groups[4]['group'])
    for nav in navList:
        writerRow([
            nav['title'],
            nav['gid'],
            nav['containerid'],
        ])
  47 +
  48 +
if __name__ == '__main__':
    # Make sure the CSV and its header exist before any rows are appended.
    init()
    url = 'https://weibo.com/ajax/feed/allGroups'
    response = get_data(url)
    if response is None:
        # get_data returns None for any non-200 status; stop with a clear
        # message instead of failing inside parse_json.
        raise SystemExit('Failed to fetch navigation data from ' + url)
    parse_json(response)