爬取空气质量网(待优化)

news/2024/7/19 9:00:16 标签: python, 爬虫

爬取空气质量网

spider

import scrapy
from urllib import parse
from selenium import webdriver
from air_qualityPro.items import AirQualityproItem


# The 17 prefecture-level cities of Henan whose data the spider collects.
# '安阳' was missing from the original list (16 entries) even though the
# spider's own comments refer to 17 cities; it is restored in its usual
# position between '平顶山' and '鹤壁'.
city_names = [
    '郑州', '开封', '洛阳', '平顶山', '安阳', '鹤壁', '新乡', '焦作', '濮阳',
    '许昌', '漯河', '三门峡', '南阳', '商丘', '信阳', '周口', '驻马店',
]


class AirQualitySpider(scrapy.Spider):
    """Collect per-month air-quality figures for the cities in ``city_names``.

    ``parse`` reads the city index page, keeps the links whose text is in
    ``city_names`` and requests each city's month-data page; ``parse_month``
    turns the table on that page into one ``AirQualityproItem`` per city.

    A Selenium Chrome instance is created in ``__init__`` and exposed as
    ``self.browser``; this class itself never drives it, it only creates it
    and quits it in ``closed``.
    NOTE(review): presumably a downloader middleware uses ``spider.browser``
    (and ``spider.city_urls``) to render the pages -- confirm against the
    project's middlewares.
    """

    name = 'air_quality'
    # allowed_domains = ['www.xx.com']
    start_urls = ['https://www.aqistudy.cn/historydata/']
    # Month-data URLs of the matched cities, filled in by ``parse``.
    city_urls = []
    # Location of the chromedriver binary. Because the base initializer copies
    # spider arguments onto the instance, it can be overridden per run with
    # ``scrapy crawl air_quality -a chromedriver_path=...``.
    chromedriver_path = r'D:\Python work space\ Reptile\python期末作业\chromedriver.exe'

    # Item field -> XPath (relative to a table row) of the cell holding it.
    # NOTE(review): the original also extracted './td[2]/a/text()' (day_time)
    # and './td[11]/span/text()' (Quality) but never stored them. Whether
    # AirQualityproItem declares those fields is not visible here, so they are
    # left out; add them to this mapping if the item declares them.
    _FIELD_XPATHS = {
        'AQI': './td[3]/text()',
        'RANGE': './td[10]/text()',
        'PM2_5': './td[12]/text()',
        'PM10': './td[13]/text()',
        'NO2': './td[21]/text()',
        'CO': './td[17]/text()',
        'SO2': './td[18]/text()',
        'O3': './td[22]/text()',
    }

    def __init__(self, *args, **kwargs):
        """Run the base initializer, then start the Chrome browser.

        Accepting and forwarding ``*args``/``**kwargs`` keeps the spider
        constructible when Scrapy passes spider arguments (``-a key=value``).
        """
        super().__init__(*args, **kwargs)
        self.browser = webdriver.Chrome(executable_path=self.chromedriver_path)
        # Make ``navigator.webdriver`` read as undefined on every new document.
        self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
        })

    def parse(self, response):
        """Find the wanted cities on the index page and request their pages.

        Every ``<li>`` whose link text is in ``city_names`` contributes one
        month-data URL to ``self.city_urls``; a request handled by
        ``parse_month`` is then yielded for each collected URL.
        """
        li_list = response.xpath('/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li')
        for li in li_list:
            city = li.xpath('./a/text()').extract_first()
            if city not in city_names:
                continue
            # e.g. https://www.aqistudy.cn/historydata/monthdata.php?city=%E9%83%91%E5%B7%9E
            city_url = ('https://www.aqistudy.cn/historydata/monthdata.php?city='
                        + parse.quote(city))
            # ``city_urls`` is a class-level list, so guard against duplicates
            # in case this callback runs more than once.
            if city_url not in self.city_urls:
                self.city_urls.append(city_url)

        # The original broke out of this loop after the first URL, so only one
        # city was ever crawled; every collected city is requested now.
        for city_url in self.city_urls:
            self.logger.info('Requesting month data: %s', city_url)
            yield scrapy.Request(url=city_url, callback=self.parse_month)

    def parse_month(self, response):
        """Build one item from a city's month-data table.

        Each field named in ``_FIELD_XPATHS`` becomes a list with one entry
        per table row (``None`` where the cell has no text); ``city_name`` is
        the text of the page's ``<h2>``.
        """
        rows = response.xpath('/html/body/div[3]/div[1]/div[1]/table/tbody/tr')

        item = AirQualityproItem()
        item['city_name'] = response.xpath('/html/body/h2/text()').extract_first()
        for field, cell_xpath in self._FIELD_XPATHS.items():
            item[field] = [row.xpath(cell_xpath).extract_first() for row in rows]
        yield item

    def closed(self, spider):
        """Quit the browser when the spider is closed.

        Scrapy's ``closed`` hook is called with the close reason; the
        parameter keeps its original name ``spider`` so the signature is
        unchanged. The value is not used.
        """
        self.browser.quit()



http://www.niftyadmin.cn/n/1371812.html

相关文章

旋转数组的最小数字、二叉搜索树节点最小距离

剑指 Offer 11. 旋转数组的最小数字 def minArray(self, numbers: List[int]) -> int: num1 = numbers[0] num2 = numbers[0] for i in range(len(numbers)-1): if numbers[i] > numbers[i+1]: num2 = numbers[i+1] break return min(num1,num2) leetCode 783 二叉搜索树节点最小距离 …

matplotlib绘图基础

matplotlib绘图 plt.plot()绘制线性图 绘制单条线性图 绘制多条线性图 设置坐标系的比例plt.figure(figsize=(a,b)) 设置图例legend() 设置轴的标识 图例保存 fig = plt.figure() plt.plot(x,y) figure.savefig() 柱状图:plt.bar() 参数:第一…

解决Scrapy请求丢失问题

在使用Scrapy爬取多页数据时,容易出现丢失请求,数据爬取不完整的问题 def parse_city(self, response): month_urls = [] li_list = response.xpath('/html/body/div[7]/div[1]/div[13]/div/div/ul/li/a/@href').extract() for li in li_list: day_q = li[-11:-5] i…

react ref的三种写法

有两个输入框：第一个点击按钮会弹出输入框里面的内容，第二个失去焦点会显示里面的内容。第一种（最早的写法，以后可能会去掉） <!DOCTYPE html> <html lang="en"><head><meta charset="UTF-8"><…

python基础小游戏

python基础小游戏 import random print("*"*10+"唐僧大战白骨精"+"*"*10) name = input('请选择你的身份:\n\t1.唐僧\n\t2.白骨精\n请选择：') if name == '2': print("咦你竟然选择白骨精，就不让你是白骨精") elif name …

python提取字符串中的数字

利用正则表达式提取字符串中的数字 import re str_ = "我11是个32字符串，我中4间有677数字88" number = re.findall("\d+",str_) # 输出结果为列表 # 列表中的数字的数据类型是str # ['11', '32', '4', '677', '88'] number = [int(x) for x in number] #…

python实现炫酷字母雨

python实现炫酷字母雨 import random, pygame PANEL_width = 800 PANEL_highly = 500 FONT_PX = 15 pygame.init() # 创建一个窗口 winSur = pygame.display.set_mode() font = pygame.font.SysFont('123.ttf', 22) bg_suface = pygame.Surface((1920,1080), flags=pygame.SRCALPHA) pyg…