利用post请求爬取动态评论

news/2024/7/19 12:44:39 标签: python, 爬虫, json

任务：利用post请求爬取动态评论

实现：

酒店网站的评论是动态加载、实时更新的，网页源代码里只包含其中一部分。要想得到全部评论，需要在浏览器里（我用的是火狐浏览器）右击页面选择“检查元素”，查看页面实际发出的请求，如下图

一般用 get 请求就可以得到网页源代码里我们想要的内容，但得不到动态加载的信息，所以这里要用 post 请求。此时我们需要参数里面的 listRequest.hotelIDs。这是为什么呢？因为在网页源代码里，每家酒店都有各自不同的 data-hotelid，并以此相互区分。post 请求得到的结果如下

代码如下

#使用的库

import json
import sys
import time
import urllib
import urllib2

import requests
from bs4 import BeautifulSoup
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter start-up, so the module is reloaded to get it back. UTF-8 then
# becomes the codec used for implicit str <-> unicode conversions below.
reload(sys)
sys.setdefaultencoding( "utf-8" )

# Not every header below is strictly required; the one that matters most
# is User-Agent, which must always be present.
headers = {
    'Host': 'hotel.elong.com',
    'Connection': 'keep-alive',
    'Accept': (
        'text/html,application/xhtml+xml,'
        'application/xml;q=0.9,*/*;q=0.8'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) '
        'Gecko/20100101 Firefox/56.0'
    ),
    'Upgrade-Insecure-Requests': '1',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/json;charset=UTF-8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
}

# Page through the hotel-list endpoint and collect the hotel ids.
# NOTE(review): `url` (the list endpoint) and `data` (the POST form, which
# must contain an integer 'listRequest.pageIndex') are not defined in this
# part of the file; they have to be set before this loop runs.

# The name shadows the builtin, but get_information() reads the ids from the
# module-level `list`, so the name is kept.
list = []
while True:
    d = requests.post(url, data=data, headers=headers)
    j = json.loads(d.text)
    # 'hotelIds' is a comma-separated string; extend() keeps `list` flat so
    # no separate flattening pass is needed afterwards.
    list.extend(j['value']['hotelIds'].split(","))
    data['listRequest.pageIndex'] += 1
    # `>=` instead of `==` so a starting index above 2 cannot loop forever.
    if data['listRequest.pageIndex'] >= 2:
        break
# `list` now holds the collected hotel ids as a flat list of strings.
print(list)

#接下来就可以定义一个函数来抓取评论了
def get_information(page_count=4):
    """Save each hotel's description and reviews to ``<hotel name>.txt``.

    For every hotel id in the module-level ``list`` this downloads the hotel
    detail page, writes the text of the ``hotelContent`` element to a file
    named after the ``h1.breadcrumb`` heading, and then appends the reviews
    returned by the ``gethotelreviews`` endpoint to that same file.

    Args:
        page_count: number of review pages (``pageIndex`` 0 up to
            ``page_count - 1``) requested per hotel. Defaults to 4, the
            value that was previously hard-coded.
    """
    base_url = 'http://hotel.elong.com/'
    review_url = base_url + 'ajax/detail/gethotelreviews/?hotelId='

    for hotel_id in list:
        page = urllib.urlopen(base_url + hotel_id)
        soup = BeautifulSoup(page, "html.parser")

        # The heading doubles as the output file name; fall back to the
        # hotel id so a page without the heading does not abort the run.
        heading = soup.find('h1', class_="breadcrumb")
        if heading is not None:
            name = heading.get_text(strip=True)
        else:
            name = hotel_id
        print(name)

        # One file per hotel, closed by the `with` block. Keeping the review
        # download inside it ensures reviews land in their own hotel's file
        # instead of all being written to whichever file was opened last.
        with open(name + ".txt", "w") as out:
            for tag in soup.find_all('div', id="hotelContent"):
                text = tag.get_text().replace(" ", "")
                out.write(text.encode('utf-8'))

            # NOTE(review): the URL without paging parameters used to be
            # requested as well; it is dropped on the assumption that it
            # only duplicates pageIndex=0 -- confirm against the endpoint.
            for index in range(page_count):
                page_url = (review_url + hotel_id +
                            '&recommendedType=0&pageIndex=' + str(index))
                print(page_url)
                response = urllib2.urlopen(urllib2.Request(page_url))
                payload = json.loads(response.read())
                for review in payload.get("contents") or []:
                    content = review["content"].encode('utf-8')
                    # Must not be called `time`: that would shadow the time
                    # module and make time.sleep() below fail.
                    created = review["createTimeString"].encode('utf-8')
                    print(created)
                    out.write("评价内容"+'\n'+content+'\n'+"评论时间:"+created+'\n')
                # Pause once per request rather than once per review.
                time.sleep(1)