# 爬取空气质量网 (crawling the air-quality website)
# spider
import scrapy
from urllib import parse
from selenium import webdriver
from air_qualityPro.items import AirQualityproItem
# The 17 prefecture-level cities of Henan province to crawl.
# The names are compared by exact equality with the link text extracted from
# the site's city index page, so they must stay in Chinese.
city_names = [
    '郑州', '开封', '洛阳', '平顶山', '安阳', '鹤壁', '新乡', '焦作', '濮阳',
    '许昌', '漯河', '三门峡', '南阳', '商丘', '信阳', '周口', '驻马店',
]
class AirQualitySpider(scrapy.Spider):
    """Crawl monthly air-quality history for the cities listed in ``city_names``.

    ``parse`` reads the city index at ``start_urls[0]`` and schedules one
    request per matching city; ``parse_month`` turns the table on each city
    page into a single ``AirQualityproItem`` holding one list per column.

    A Chrome instance is started in ``__init__`` and kept on ``self.browser``.
    Nothing in this class drives it directly; presumably a downloader
    middleware elsewhere in the project uses it to render the pages
    -- TODO confirm.
    """

    name = 'air_quality'
    start_urls = ['https://www.aqistudy.cn/historydata/']

    # Chromedriver location used when no ``driver_path`` argument is supplied.
    _DEFAULT_DRIVER_PATH = r'D:\Python work space\ Reptile\python期末作业\chromedriver.exe'

    # A city's monthly page is this prefix plus the percent-encoded city name,
    # e.g. https://www.aqistudy.cn/historydata/monthdata.php?city=%E9%83%91%E5%B7%9E
    _MONTHLY_URL_PREFIX = 'https://www.aqistudy.cn/historydata/monthdata.php?city='

    # Item field -> XPath (relative to one table row) of the cell holding it.
    # NOTE(review): each row also has what appears to be a month label
    # ('./td[2]/a/text()') and a quality grade ('./td[11]/span/text()') that
    # are not exported, so the lists below are positional only. Add entries
    # here once AirQualityproItem is confirmed to declare fields for them.
    _COLUMN_XPATHS = {
        'AQI': './td[3]/text()',
        'RANGE': './td[10]/text()',
        'PM2_5': './td[12]/text()',
        'PM10': './td[13]/text()',
        'NO2': './td[21]/text()',
        'CO': './td[17]/text()',
        'SO2': './td[18]/text()',
        'O3': './td[22]/text()',
    }

    def __init__(self, *args, driver_path=None, **kwargs):
        """Initialise the spider and start the Chrome browser.

        Args:
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            driver_path: Path to the chromedriver executable. Falls back to
                ``_DEFAULT_DRIVER_PATH`` when omitted; can be supplied on the
                command line with ``-a driver_path=...``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        # URLs of the per-city monthly pages, filled in by ``parse``. Kept per
        # instance so separate spider objects never share one list.
        self.city_urls = []
        # NOTE(review): ``executable_path`` is the Selenium 3 style argument;
        # Selenium 4 replaces it with a Service object. Left as is because
        # the installed Selenium version is not visible from this file.
        self.browser = webdriver.Chrome(
            executable_path=driver_path or self._DEFAULT_DRIVER_PATH
        )
        try:
            # Make ``navigator.webdriver`` read as undefined on every new
            # document, before any page script runs.
            self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
            })
        except Exception:
            # Do not leave a Chrome process behind if the patch call fails.
            self.browser.quit()
            raise

    def parse(self, response):
        """Collect the monthly-page URL of every wanted city and request each one.

        Args:
            response: Response for the city index page in ``start_urls``.

        Yields:
            One ``scrapy.Request`` per matched city, handled by ``parse_month``.
        """
        city_links = response.xpath('/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li')
        for link in city_links:
            # The link text may be missing or padded with whitespace.
            city = (link.xpath('./a/text()').extract_first() or '').strip()
            if city in city_names:
                # ``parse`` here is the module-level ``urllib.parse`` import;
                # method names do not shadow globals inside a method body.
                self.city_urls.append(self._MONTHLY_URL_PREFIX + parse.quote(city))

        # The full list is built before the first request is yielded, so
        # ``self.city_urls`` is complete by the time any request is scheduled.
        for city_url in self.city_urls:
            self.logger.info('Scheduling city page %s', city_url)
            yield scrapy.Request(url=city_url, callback=self.parse_month)

    def parse_month(self, response):
        """Build one item from the table on a city's monthly page.

        Args:
            response: Response for one city's monthly-data page.

        Yields:
            A single ``AirQualityproItem`` whose ``city_name`` is the text of
            the page's ``<h2>`` and whose other fields are lists with one
            entry per table row (``None`` where a cell has no text).
        """
        rows = response.xpath('/html/body/div[3]/div[1]/div[1]/table/tbody/tr')

        item = AirQualityproItem()
        item['city_name'] = response.xpath('/html/body/h2/text()').extract_first()
        for field, cell_xpath in self._COLUMN_XPATHS.items():
            item[field] = [row.xpath(cell_xpath).extract_first() for row in rows]
        yield item

    def closed(self, spider):
        """Quit the Chrome browser when the spider closes.

        Args:
            spider: Unused. Scrapy documents this hook as ``closed(reason)``
                and passes the close reason positionally; the original
                parameter name is kept so existing callers are unaffected.
        """
        self.browser.quit()