路飞学院-Python爬虫实战密训班-第3章

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
# To simulate a login we first need the cookies the site hands out.
# A Scrapy response exposes url, text, etc., but -- unlike requests -- it does
# not give direct access to the cookies. CookieJar can extract them for us.
from scrapy.http.cookies import CookieJar


class GetChoutiSpider(scrapy.Spider):
    """Log in to dig.chouti.com and upvote every link on the front page.

    Flow: ``parse`` (collect cookies, POST the login form) ->
    ``check_login`` (re-request the front page) -> ``like`` (POST one vote
    per link) -> ``show_like`` (print the vote result).
    """

    name = 'get_chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    # Flat {cookie name: cookie value} mapping, filled in by ``parse``.
    cookies = None

    def parse(self, response):
        """Collect the cookies of the first response and submit the login form.

        Args:
            response: Response for the start URL.

        Yields:
            The POST login request, handled by ``check_login``.
        """
        # At this point we only have an empty container for cookies.
        cookie_obj = CookieJar()
        # ``response`` is everything we got back; ``response.request`` is the
        # request we sent. extract_cookies() needs both.
        cookie_obj.extract_cookies(response, response.request)
        # Now cookie_obj holds the cookie information.
        print(cookie_obj._cookies)
        # Sample output (a nested {domain: {path: {name: Cookie}}} mapping):
        # {'.chouti.com': {'/': {'gpsd': Cookie(version=0, name='gpsd',
        #     value='1c61978d6bb94989674386b29f2fd15d', port=None,
        #     port_specified=False, domain='.chouti.com', domain_specified=True,
        #     domain_initial_dot=False, path='/', path_specified=True,
        #     secure=False, expires=1533183431, discard=False, comment=None,
        #     comment_url=None, rest={}, rfc2109=False)}},
        #  'dig.chouti.com': {'/': {'JSESSIONID': Cookie(version=0,
        #     name='JSESSIONID', value='aaaouDhGaca3Ugddzblrw', port=None,
        #     port_specified=False, domain='dig.chouti.com',
        #     domain_specified=False, domain_initial_dot=False, path='/',
        #     path_specified=True, secure=False, expires=None, discard=True,
        #     comment=None, comment_url=None, rest={}, rfc2109=False)}}}
        #
        # Save the cookies. Request(cookies=...) expects a flat
        # {name: value} dict (or a list of cookie dicts), not the jar's
        # private nested mapping, so flatten it by iterating the jar.
        self.cookies = {cookie.name: cookie.value for cookie in cookie_obj}
        # The requests library works similarly:
        #     res = requests.get(xxxxx)
        #     res.cookies._cookies  # the cookies that were returned
        #
        # Now simulate the login: send user name, password and the cookies.
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            headers={
                'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            },
            cookies=self.cookies,
            callback=self.check_login,
            # ``body`` plays the role of ``data`` in requests, but it cannot
            # be a dict: it must be the url-encoded string itself.
            # The real account and password are masked here.
            body='phone=8618xxxxx2459&password=zxxxxxhyyxx&oneMonth=1',
        )

    # Callback used to check whether the login request succeeded.
    # It must not be self.parse, otherwise the login request would be sent
    # again when the callback runs.
    def check_login(self, response):
        """Print the login result and request the front page for voting.

        Args:
            response: Response of the login POST.

        Yields:
            A request for the front page, handled by ``like``.
        """
        print(response.text)
        # Sample output on success:
        # {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_53059370687"}}}
        #
        # Logged in; next, upvote. No explicit cookies are passed when
        # fetching the page itself.
        yield Request(
            url='https://dig.chouti.com/',
            callback=self.like,  # callback that performs the upvotes
        )

    def like(self, response):
        """Send one upvote request for every link found on the page.

        Args:
            response: Response containing the whole listing page.

        Yields:
            One POST vote request per link id, handled by ``show_like``.
        """
        id_list = response.xpath('//div[@share-linkid]/@share-linkid').extract()
        for nid in id_list:
            url = 'https://dig.chouti.com/link/vote?linksId=%s' % nid
            yield Request(
                url=url,
                method='POST',
                cookies=self.cookies,
                headers={'referer': 'https://dig.chouti.com/'},
                # One more callback to see whether the vote succeeded.
                callback=self.show_like,
            )

    def show_like(self, response):
        """Print the server's answer to a vote request."""
        print(response.text)

执行成功response.text就会返回该结果

{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_53059370687","likedTime":"1530598017650000","lvCount":"24","nick":"古明地盆","uvCount":"2921","voteTime":"小于1分钟前"}}}
{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_53059370687","likedTime":"1530598017657000","lvCount":"34","nick":"古明地盆","uvCount":"2921","voteTime":"小于1分钟前"}}}

如果点赞成功之后继续执行，就会有如下提示

{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}

会发现，我只给当前页进行了点赞，如果我想给好多页进行点赞呢？

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
# To simulate a login we first need the cookies the site hands out.
# A Scrapy response exposes url, text, etc., but -- unlike requests -- it does
# not give direct access to the cookies. CookieJar can extract them for us.
from scrapy.http.cookies import CookieJar


class GetChoutiSpider(scrapy.Spider):
    """Log in to dig.chouti.com and upvote the links on every listing page.

    Flow: ``parse`` (collect cookies, POST the login form) ->
    ``check_login`` (re-request the front page) -> ``like`` (POST one vote
    per link, then follow the pagination links back into ``like``) ->
    ``show_like`` (print the vote result).
    """

    name = 'get_chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    # Flat {cookie name: cookie value} mapping, filled in by ``parse``.
    cookies = None

    def parse(self, response):
        """Collect the cookies of the first response and submit the login form.

        Args:
            response: Response for the start URL.

        Yields:
            The POST login request, handled by ``check_login``.
        """
        # At this point we only have an empty container for cookies.
        cookie_obj = CookieJar()
        # ``response`` is everything we got back; ``response.request`` is the
        # request we sent. extract_cookies() needs both.
        cookie_obj.extract_cookies(response, response.request)
        # Now cookie_obj holds the cookie information.
        print(cookie_obj._cookies)
        # Sample output (a nested {domain: {path: {name: Cookie}}} mapping):
        # {'.chouti.com': {'/': {'gpsd': Cookie(version=0, name='gpsd',
        #     value='1c61978d6bb94989674386b29f2fd15d', port=None,
        #     port_specified=False, domain='.chouti.com', domain_specified=True,
        #     domain_initial_dot=False, path='/', path_specified=True,
        #     secure=False, expires=1533183431, discard=False, comment=None,
        #     comment_url=None, rest={}, rfc2109=False)}},
        #  'dig.chouti.com': {'/': {'JSESSIONID': Cookie(version=0,
        #     name='JSESSIONID', value='aaaouDhGaca3Ugddzblrw', port=None,
        #     port_specified=False, domain='dig.chouti.com',
        #     domain_specified=False, domain_initial_dot=False, path='/',
        #     path_specified=True, secure=False, expires=None, discard=True,
        #     comment=None, comment_url=None, rest={}, rfc2109=False)}}}
        #
        # Save the cookies. Request(cookies=...) expects a flat
        # {name: value} dict (or a list of cookie dicts), not the jar's
        # private nested mapping, so flatten it by iterating the jar.
        self.cookies = {cookie.name: cookie.value for cookie in cookie_obj}
        # The requests library works similarly:
        #     res = requests.get(xxxxx)
        #     res.cookies._cookies  # the cookies that were returned
        #
        # Now simulate the login: send user name, password and the cookies.
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            headers={
                'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            },
            cookies=self.cookies,
            callback=self.check_login,
            # ``body`` plays the role of ``data`` in requests, but it cannot
            # be a dict: it must be the url-encoded string itself.
            # The real account and password are masked here.
            body='phone=8618xxxxx2459&password=zxxxxxhyyxx&oneMonth=1',
        )

    # Callback used to check whether the login request succeeded.
    # It must not be self.parse, otherwise the login request would be sent
    # again when the callback runs.
    def check_login(self, response):
        """Print the login result and request the front page for voting.

        Args:
            response: Response of the login POST.

        Yields:
            A request for the front page, handled by ``like``.
        """
        print(response.text)
        # Sample output on success:
        # {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_53059370687"}}}
        #
        # Logged in; next, upvote. No explicit cookies are passed when
        # fetching the page itself.
        yield Request(
            url='https://dig.chouti.com/',
            callback=self.like,  # callback that performs the upvotes
        )

    def like(self, response):
        """Upvote every link on the page, then follow the pagination links.

        Args:
            response: Response containing the whole listing page.

        Yields:
            One POST vote request per link id (handled by ``show_like``),
            followed by one request per pagination link (handled by
            ``like`` itself, so each further page is processed the same way).
        """
        id_list = response.xpath('//div[@share-linkid]/@share-linkid').extract()
        for nid in id_list:
            url = 'https://dig.chouti.com/link/vote?linksId=%s' % nid
            yield Request(
                url=url,
                method='POST',
                cookies=self.cookies,
                headers={'referer': 'https://dig.chouti.com/'},
                # One more callback to see whether the vote succeeded.
                callback=self.show_like,
            )

        # The loop above only covers the current page. To upvote other pages
        # too, collect the pagination links.
        pages = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in pages:
            # urljoin resolves the href against the page URL, so it works for
            # absolute paths as well as relative or fully-qualified links.
            page_url = response.urljoin(page)
            yield Request(
                url=page_url,
                # The callback is this method itself: the response for page 2
                # is upvoted here and yields the links to further pages, the
                # response for page 3 does the same, and so on.
                callback=self.like,
            )
        # Chouti seems to have 120+ pages, so the crawl depth should be
        # limited to four levels.
        # NOTE(review): no depth limit is set in this file; presumably it is
        # configured via DEPTH_LIMIT = 4 in the project's settings.py -- confirm.

    def show_like(self, response):
        """Print the server's answer to a vote request."""
        print(response.text)