Scraping Sina Weibo User Profiles with Scrapy (Writing the Results to MongoDB)

Tags: Weibo, Python, web crawler

The fields scraped are:

  1. Weibo user ID
  2. Weibo nickname
  3. Gender
  4. Location
  5. Verification info
  6. Personal signature (bio)
  7. Number of posts
  8. Number of followers
  9. Number of accounts followed

Under the spiders folder, microID_Spider.py is written as follows:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from blogSpider.items import blogIDItem

class MicroidSpiderSpider(scrapy.Spider):
    name = 'microID_Spider'
    allowed_domains = ['weibo.cn']
    start_urls = ['https://weibo.cn/search']
    # Crawl up to 50 pages of search results by default
    max_page = 50
    # Paste your own weibo.cn cookie string here (copied from a logged-in browser session)
    myCookie = 'xxxxxxx'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'weibo.cn',
        'Origin': 'https://weibo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }

    def start_requests(self):
        # Turn the raw cookie string into the dict form Scrapy's `cookies` argument expects
        cookie = {}
        for pair in self.myCookie.split(';'):
            if '=' in pair:
                name, value = pair.split('=', 1)
                cookie[name.strip()] = value.strip()

        # Search keyword: the nickname to look up ('罗志祥' is used as the example here)
        blogID = '罗志祥'
        for i in range(1, self.max_page+1):
            url = '{url}/user/?keyword={blogID}&page={pageNum}'.format(url=self.start_urls[0], blogID=blogID, pageNum=i)
            yield scrapy.FormRequest(
                url,
                headers=self.headers,
                cookies=cookie,
                callback=self.write_BlogID,
            )

    def write_BlogID(self, response):
        pageInfo = Selector(response)
        # print(response.body.decode('utf-8'))
        # Each user in the search results sits in its own HTML <table>, so grab every table on the page
        all_Table = pageInfo.xpath('//table')
        # Rebuild the cookie dict (same parsing as in start_requests)
        cookie = {}
        for pair in self.myCookie.split(';'):
            if '=' in pair:
                name, value = pair.split('=', 1)
                cookie[name.strip()] = value.strip()
        # print(all_Table)
        for table in all_Table:
            # The first link in each result table points to the user's profile page
            ID_href = table.css('a::attr(href)').extract()[0]
            # print(ID_href.split('?'))
            # print(type(ID_href))
            url = 'https://weibo.cn' + ID_href.split('?')[0]
            # print(url)
            yield scrapy.Request(
                url,
                headers=self.headers,
                cookies=cookie,
                callback=self.getBlogIDinfo,
            )


    def getBlogIDinfo(self, response):
        # print(response.body.decode('utf-8'))
        # print(response.url)
        blogID_Info = blogIDItem()
        # The user ID is the last path segment of the profile URL, so take it from there
        blogID_Info['ID'] = response.url.split('/')[-1]
        pageInfo = Selector(response)
        ut_div = pageInfo.xpath('//div[@class="ut"]')
        spans = ut_div.xpath('span[@class="ctt"]/text()').extract()
        # print(len(spans))
        # print(spans)
        # The span.ctt list can hold 1 to 4 elements; handle each case
        if len(spans) == 1:
            # Case 1: only nickname, gender and location, e.g. ['羅誌祥\xa0男/台湾    \xa0    ']
            firstRowInfo = spans[0].split(u'\xa0')
            # Nickname
            blogID_Info['blogName'] = firstRowInfo[0].replace(u'\xa0', u' ')
            # Gender (text before the '/')
            blogID_Info['sex'] = firstRowInfo[1][:firstRowInfo[1].index('/')]
            # Location (text after the '/')
            blogID_Info['location'] = firstRowInfo[1][firstRowInfo[1].index('/')+1:].strip(' ')
            # No verification info
            blogID_Info['identification'] = ''
            # No personal signature
            blogID_Info['personal_sign'] = ''
        elif len(spans) == 2:
            # Case 2: the first span holds nickname/gender/location; the second span is
            # either the verification info or the personal signature
            firstRowInfo = spans[0].split(u'\xa0')
            blogID_Info['blogName'] = firstRowInfo[0].replace(u'\xa0', u' ')
            blogID_Info['sex'] = firstRowInfo[1][:firstRowInfo[1].index('/')]
            blogID_Info['location'] = firstRowInfo[1][firstRowInfo[1].index('/') + 1:].strip(' ')
            if spans[1].find('认证') == -1:
                # No verification info; the span is the personal signature
                blogID_Info['identification'] = ''
                blogID_Info['personal_sign'] = spans[1].replace(u'\u301c', u' ')
            else:
                # The span is the verification info; no personal signature
                blogID_Info['identification'] = spans[1].replace(u'\u301c', u' ')
                blogID_Info['personal_sign'] = ''
        elif len(spans) == 3:
            # Case 3: nickname in its own span, gender/location in the second span;
            # the third span is either the verification info or the personal signature
            blogID_Info['blogName'] = spans[0].replace(u'\xa0', u' ')
            secondRowInfo = spans[1].split(u'\xa0')
            blogID_Info['sex'] = secondRowInfo[1][:secondRowInfo[1].index('/')]
            blogID_Info['location'] = secondRowInfo[1][secondRowInfo[1].index('/') + 1:].strip(' ')
            if spans[2].find('认证') == -1:
                # No verification info; the span is the personal signature
                blogID_Info['identification'] = ''
                blogID_Info['personal_sign'] = spans[2].replace(u'\u301c', u' ')
            else:
                # The span is the verification info; no personal signature
                blogID_Info['identification'] = spans[2].replace(u'\u301c', u' ')
                blogID_Info['personal_sign'] = ''
        elif len(spans) == 4:
            # Case 4: nickname, gender/location, verification info and personal signature are all present
            blogID_Info['blogName'] = spans[0].replace(u'\xa0', u' ')
            secondRowInfo = spans[1].split(u'\xa0')
            blogID_Info['sex'] = secondRowInfo[1][:secondRowInfo[1].index('/')]
            blogID_Info['location'] = secondRowInfo[1][secondRowInfo[1].index('/') + 1:].strip(' ')
            blogID_Info['identification'] = spans[2].replace(u'\u301c', u' ')
            blogID_Info['personal_sign'] = spans[3].replace(u'\u301c', u' ')
        # print(blogID_Info['blogName'])
        # Post / follow / follower counts appear as text like '微博[1234]'; keep only the number inside the brackets
        blogNumInfo = pageInfo.xpath('//span[@class="tc"]/text()').extract()
        # print(blogNumInfo)
        tip2 = pageInfo.xpath('//div[@class="tip2"]')
        focusInfo = tip2.xpath('a[1]/text()').extract()
        # print(focusInfo)
        fansInfo = tip2.xpath('a[2]/text()').extract()
        # print(fansInfo)
        blogID_Info['blog_Num'] = blogNumInfo[0][blogNumInfo[0].index('[')+1:blogNumInfo[0].index(']')]
        blogID_Info['focus_Num'] = focusInfo[0][focusInfo[0].index('[')+1:focusInfo[0].index(']')]
        blogID_Info['fans_Num'] = fansInfo[0][fansInfo[0].index('[')+1:fansInfo[0].index(']')]
        # print(blogID_Info)
        yield blogID_Info

items.py is written as follows:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class blogIDItem(scrapy.Item):
    '''
    Weibo user profile item
    '''
    # Name of the MongoDB collection the pipeline writes to
    collection = 'blogID_Data'
    ID = scrapy.Field()
    blogName = scrapy.Field()
    sex = scrapy.Field()
    location = scrapy.Field()
    identification = scrapy.Field()
    personal_sign = scrapy.Field()
    blog_Num = scrapy.Field()
    fans_Num = scrapy.Field()
    focus_Num = scrapy.Field()
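
The title promises writing the results to MongoDB, but the pipeline itself is not shown above. Below is a minimal sketch of what pipelines.py could look like, assuming pymongo is installed; the class name MongoPipeline and the setting names MONGO_URI and MONGO_DB are illustrative choices for this sketch, not taken from the original project:

# -*- coding: utf-8 -*-
# pipelines.py -- minimal MongoDB pipeline sketch (illustrative, not the original project's code)
import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # MONGO_URI / MONGO_DB are hypothetical setting names used for this sketch
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'weibo'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # blogIDItem carries its target collection name ('blogID_Data') in its `collection` attribute
        self.db[item.collection].insert_one(dict(item))
        return item

For the pipeline to run, it also has to be enabled in settings.py, for example ITEM_PIPELINES = {'blogSpider.pipelines.MongoPipeline': 300} (the priority value 300 is arbitrary).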


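To start the crawl, you can run scrapy crawl microID_Spider from the project root, or use a small runner script. A minimal sketch, assuming it sits next to scrapy.cfg (the file name run.py is arbitrary):

# -*- coding: utf-8 -*-
# run.py -- start the spider from a script, reusing the project's own settings
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from blogSpider.spiders.microID_Spider import MicroidSpiderSpider

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(MicroidSpiderSpider)
    process.start()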
