python">
python comments"># -*- coding: utf-8 -*-
python keyword">import
python plain">scrapy
python keyword">from
python plain">scrapy.http
python keyword">import
python plain">Request
python comments"># 我们如果要模拟登陆,就必须要拿到cookie
python comments"># response里面有url,text等等,但遗憾的是不像requests,可以直接拿到cookie
python comments"># 但我们可以导入一个模块
python keyword">from
python plain">scrapy.http.cookies
python keyword">import
python plain">CookieJar
class GetChoutiSpider(scrapy.Spider):
    """Log in to dig.chouti.com and upvote every link on the front page.

    Callback chain: ``parse`` (capture the session cookies, POST the login
    form) -> ``check_login`` (re-request the front page) -> ``like`` (POST
    one vote per link) -> ``show_like`` (print the vote response).
    """

    name = 'get_chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    # Session cookies captured in ``parse`` as a flat {name: value} dict.
    # Remains None until the first response has been handled.
    cookies = None

    def parse(self, response):
        """Capture the cookies set by the first response and submit the login.

        Yields:
            Request: the login POST, handled by ``check_login``.
        """
        # A freshly created CookieJar is only an empty container.
        cookie_obj = CookieJar()
        # ``response`` is the reply we received; ``response.request`` is the
        # request that produced it. extract_cookies needs both.
        cookie_obj.extract_cookies(response, response.request)
        # Debug output of the raw jar contents. Sample (abridged):
        # {'.chouti.com': {'/': {'gpsd': Cookie(name='gpsd', ...)}},
        #  'dig.chouti.com': {'/': {'JSESSIONID': Cookie(name='JSESSIONID', ...)}}}
        # (requests exposes the same structure as ``res.cookies._cookies``.)
        print(cookie_obj._cookies)
        # Request(cookies=...) takes a flat {name: value} mapping, not the
        # jar's private domain -> path -> name -> Cookie nesting, so flatten
        # it here. NOTE: cookies sharing a name across domains/paths collapse
        # to the last one seen.
        self.cookies = {cookie.name: cookie.value for cookie in cookie_obj}
        # Simulate the login: credentials plus the cookies captured above.
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            headers={
                'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            },
            cookies=self.cookies,
            # The callback must not be self.parse, otherwise the login
            # request would be issued again when the callback runs.
            callback=self.check_login,
            # ``body`` plays the role of ``data`` in requests, but it must be
            # an already url-encoded string rather than a dict.
            # The credentials below are masked placeholders.
            body='phone=8618xxxxx2459&password=zxxxxxhyyxx&oneMonth=1',
        )

    def check_login(self, response):
        """Print the login response and request the front page for voting.

        Yields:
            Request: a GET of the front page, handled by ``like``.
        """
        # Sample output on success:
        # {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_53059370687"}}}
        print(response.text)
        # Logged in; fetch the listing page next. No explicit cookies are
        # attached to this request.
        yield Request(
            url='https://dig.chouti.com/',
            callback=self.like,
        )

    def like(self, response):
        """Issue one vote request for every link id found on the page.

        Yields:
            Request: a vote POST per link, handled by ``show_like``.
        """
        # ``response`` is the whole listing page at this point.
        id_list = response.xpath('//div[@share-linkid]/@share-linkid').extract()
        for nid in id_list:
            url = 'https://dig.chouti.com/link/vote?linksId=%s' % nid
            yield Request(
                url=url,
                method='POST',
                cookies=self.cookies,
                headers={'referer': 'https://dig.chouti.com/'},
                # Extra callback so the result of each vote can be inspected.
                callback=self.show_like,
            )

    def show_like(self, response):
        """Print the server's reply to a vote request."""
        print(response.text)
python spaces">
|
执行成功response.text就会返回该结果
{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_53059370687","likedTime":"1530598017650000","lvCount":"24","nick":"古明地盆","uvCount":"2921","voteTime":"小于1分钟前"}}}
{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_53059370687","likedTime":"1530598017657000","lvCount":"34","nick":"古明地盆","uvCount":"2921","voteTime":"小于1分钟前"}}}
如果点赞成功之后继续执行,就会有如下提示
{"result":{"code":"30010", "message":"你已经推荐过了", "data":""}}
会发现,我只给当前页进行了点赞,如果我想给好多页进行点赞呢?
python">
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
python comments"># -*- coding: utf-8 -*-
python keyword">import
python plain">scrapy
python keyword">from
python plain">scrapy.http
python keyword">import
python plain">Request
python comments"># 我们如果要模拟登陆,就必须要拿到cookie
python comments"># response里面有url,text等等,但遗憾的是不像requests,可以直接拿到cookie
python comments"># 但我们可以导入一个模块
python keyword">from
python plain">scrapy.http.cookies
python keyword">import
python plain">CookieJar
class GetChoutiSpider(scrapy.Spider):
    """Log in to dig.chouti.com and upvote the links on every listing page.

    Callback chain: ``parse`` (capture the session cookies, POST the login
    form) -> ``check_login`` (re-request the front page) -> ``like`` (POST
    one vote per link, then follow the pagination links back into ``like``)
    -> ``show_like`` (print the vote response).
    """

    name = 'get_chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    # Session cookies captured in ``parse`` as a flat {name: value} dict.
    # Remains None until the first response has been handled.
    cookies = None

    def parse(self, response):
        """Capture the cookies set by the first response and submit the login.

        Yields:
            Request: the login POST, handled by ``check_login``.
        """
        # A freshly created CookieJar is only an empty container.
        cookie_obj = CookieJar()
        # ``response`` is the reply we received; ``response.request`` is the
        # request that produced it. extract_cookies needs both.
        cookie_obj.extract_cookies(response, response.request)
        # Debug output of the raw jar contents. Sample (abridged):
        # {'.chouti.com': {'/': {'gpsd': Cookie(name='gpsd', ...)}},
        #  'dig.chouti.com': {'/': {'JSESSIONID': Cookie(name='JSESSIONID', ...)}}}
        # (requests exposes the same structure as ``res.cookies._cookies``.)
        print(cookie_obj._cookies)
        # Request(cookies=...) takes a flat {name: value} mapping, not the
        # jar's private domain -> path -> name -> Cookie nesting, so flatten
        # it here. NOTE: cookies sharing a name across domains/paths collapse
        # to the last one seen.
        self.cookies = {cookie.name: cookie.value for cookie in cookie_obj}
        # Simulate the login: credentials plus the cookies captured above.
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            headers={
                'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            },
            cookies=self.cookies,
            # The callback must not be self.parse, otherwise the login
            # request would be issued again when the callback runs.
            callback=self.check_login,
            # ``body`` plays the role of ``data`` in requests, but it must be
            # an already url-encoded string rather than a dict.
            # The credentials below are masked placeholders.
            body='phone=8618xxxxx2459&password=zxxxxxhyyxx&oneMonth=1',
        )

    def check_login(self, response):
        """Print the login response and request the front page for voting.

        Yields:
            Request: a GET of the front page, handled by ``like``.
        """
        # Sample output on success:
        # {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_53059370687"}}}
        print(response.text)
        # Logged in; fetch the listing page next. No explicit cookies are
        # attached to this request.
        yield Request(
            url='https://dig.chouti.com/',
            callback=self.like,
        )

    def like(self, response):
        """Vote for every link on this page, then follow the pagination links.

        Yields:
            Request: a vote POST per link (handled by ``show_like``), followed
            by a GET per pagination link (handled by ``like`` again).
        """
        # ``response`` is the whole listing page at this point.
        id_list = response.xpath('//div[@share-linkid]/@share-linkid').extract()
        for nid in id_list:
            url = 'https://dig.chouti.com/link/vote?linksId=%s' % nid
            yield Request(
                url=url,
                method='POST',
                cookies=self.cookies,
                headers={'referer': 'https://dig.chouti.com/'},
                # Extra callback so the result of each vote can be inspected.
                callback=self.show_like,
            )

        # The loop above only covers the current page; to vote on the other
        # pages too, collect the pager hrefs and schedule each of them.
        pages = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in pages:
            # urljoin resolves the href against the page URL, so both
            # root-relative and absolute hrefs produce a valid target.
            page_url = response.urljoin(page)
            yield Request(
                url=page_url,
                # The callback is this very method: every followed page is
                # voted on and its own pager links are followed in turn.
                # NOTE(review): the site reportedly has 120+ pages; how deep
                # this recursion goes is presumably bounded by the project's
                # DEPTH_LIMIT setting, which is not visible here.
                callback=self.like,
            )

    def show_like(self, response):
        """Print the server's reply to a vote request."""
        print(response.text)
|
第七页也被点赞了
第十页也被点赞了
既然如此,就玩一个疯狂的,给所有页面都点赞
将settings里面的DEPTH_LIMIT=4改成DEPTH_LIMIT=0,等于零表示无限查找
可以看到,一共120页,全点上了赞