Python抓取网页中的动态序列化数据

Python抓取网页中的动态序列化数据
动态序列化数据经常出现在前后端分离的页面，或者通过 Vue、JS 等方式动态渲染的 HTML 页面中。常规的爬虫抓取方法并不能满足这类数据的采集要求，因此需要采用其他方式进行数据采集。

这里介绍的2个实例仅限于基础内容的演示；在实际项目中，考虑到爬虫效率的问题，通常会使用不同的爬虫框架来规划爬虫项目。

Ajax 实例

Ajax页面解析
现在越来越多的网页的原始HTML文档不包括任何数据，而是采用Ajax统一加载。从发送Ajax请求到网页更新的过程：

发送请求。
解析内容。
渲染网页。

打开浏览器的开发者工具，切换到 Network 选项卡，使用 XHR 过滤器查看页面发出的 Ajax 请求。运行代码前，需要按照 all_config_file.py 文件中的配置建立相应的文件夹（如日志目录 logs），按需修改该配置，并且开启相关服务（如 MongoDB）。

完整代码实例
all_config_file.py 配置文件

python"># coding=utf-8
__author__ = 'Mr数据杨'
__explain__ = '各目标网站爬虫脚本配置文件'

#加载引用模块
import time
import pymongo
import pandas as pd

def news_page_num():
    """Ask on stdin how many pages to crawl per site and return the answer as an int."""
    return int(input("输入每个网站页面爬取的页面数:"))

def title_error_num():
    """Ask on stdin for the maximum number of failed title fetches and return it as an int."""
    answer = input("输入错误标题爬取最大数:")
    limit = int(answer)
    return limit

def body_error_num():
    """Ask on stdin for the maximum number of failed page fetches and return it as an int."""
    answer = input("输入错误页面爬取最大数:")
    limit = int(answer)
    return limit


def mongodb_client():
    """Return the ``news`` database of the MongoDB server at localhost:27017.

    Prints a confirmation message before returning.
    """
    connection = pymongo.MongoClient("localhost", 27017)
    news_db = connection["news"]
    print("加载MongoDB数据库完毕......")
    return news_db

# Module-level database handle read by get_title_links_from_MongoDB().
# NOTE: this runs (and prints its message) as a side effect of importing the module.
db=mongodb_client()

def time_today():
    """Return the current local date formatted as ``YYYY-MM-DD``.

    Prints a confirmation message before returning.
    """
    # strftime() without an explicit time tuple formats the current local time.
    date_text = time.strftime('%Y-%m-%d')
    print("加载全局日期函数完毕......")
    return date_text


# Error log for failed title (list page) fetches
def error_text_title(text,time_today):
    """Append one line to the title error log of the given day.

    The log file is ``logs/<time_today> news_title_error.txt`` relative to the
    current working directory.

    Args:
        text: Message to append; a newline is added after it.
        time_today: Date string used as the file-name prefix.
    """
    import os

    print("加载错误信息日志完毕......")
    # Create the log directory on demand so that the first reported error does
    # not itself fail with FileNotFoundError.
    os.makedirs("logs", exist_ok=True)
    # Messages contain non-ASCII text; pin the encoding instead of relying on
    # the platform default.
    with open("logs/" + time_today + " news_title_error.txt", "a", encoding="utf-8") as f:
        f.write(text + '\n')

# Error log for failed article body fetches
def error_text_body(text,time_today):
    """Append one line to the article-body error log of the given day.

    The log file is ``logs/<time_today> news_body_error.txt`` relative to the
    current working directory.

    Args:
        text: Message to append; a newline is added after it.
        time_today: Date string used as the file-name prefix.
    """
    import os

    # Create the log directory on demand so that the first reported error does
    # not itself fail with FileNotFoundError.
    os.makedirs("logs", exist_ok=True)
    # Messages contain non-ASCII text; pin the encoding instead of relying on
    # the platform default.
    with open("logs/" + time_today + " news_body_error.txt", "a", encoding="utf-8") as f:
        f.write(text + '\n')

# Look up the stored article links of one site/category
def get_title_links_from_MongoDB(label, type):
    """Return the ``url`` and ``_id`` of every ``news_tmp`` document matching label and type.

    Args:
        label: Value of the documents' ``label`` field.
        type: Value of the documents' ``type`` field.

    Returns:
        A pandas DataFrame with the columns ``url`` and ``_id``.
    """
    query = {'label': label, 'type': type}
    projection = {'url': 1, '_id': 1}
    rows = list(db.news_tmp.find(query, projection))
    return pd.DataFrame(rows, columns=['url', '_id'])

爬虫主程序

python"># 加载引用模块
import urllib
import urllib.request
import requests
import datetime
from bs4 import BeautifulSoup
import all_config_file
from all_config_file import error_text_title
from all_config_file import error_text_body
from all_config_file import get_title_links_from_MongoDB

# List endpoint of news.cqcoal.com; start_type() sends POST form data to it.
cqcoal = "http://news.cqcoal.com/manage/newsaction.do?method:webListPageNewsArchivesByTypeid"
print("加载目标网址完毕......")
# Database handle shared by both crawl stages below.
db = all_config_file.mongodb_client()
# Date string handed to the error loggers, which use it in the log file name.
time_today = all_config_file.time_today()
def cqcoal_title_start(num):
    """Crawl the news list pages of news.cqcoal.com and store the headlines.

    For every news category the list endpoint is POSTed once per page
    (pages 1..num) and every returned entry is inserted into ``db.news_tmp``
    as ``{'title', 'url', 'date', 'type', 'label'}``.

    Args:
        num: Number of list pages to fetch for each category.
    """
    def start_type(url, label, typeid, pagenum, type):
        """Fetch pages 1..pagenum of one category.

        Any failure aborts the remaining pages of this category and is
        reported through error_text_title() and stdout.
        """
        try:
            for page in range(1, pagenum + 1):
                print("开始爬取：" + url)
                # Send the page currently being iterated; the page count itself
                # must not be used here or every iteration fetches the same page.
                post_param = {'pageNum': page, 'pageSize': '20', 'jsonStr': typeid}
                # A timeout keeps a stalled connection from hanging the crawl;
                # the resulting exception is handled below like any other failure.
                return_data = requests.post(url, data=post_param, verify=False, timeout=120)
                content = return_data.text
                if label == 'news.cqcoal.com':
                    get_cqcoal_page_news(content, type)
                    print('新闻抓取完毕')
        except Exception:
            error = str(url+ " label:" + label + " gd:" + str(typeid) + " pagenum:" + str(pagenum) + " type:" + type + ' 未抓取到')
            error_text_title(error, time_today)
            print (error)

    def get_cqcoal_page_news(content, type):
        """Parse one list response and insert every entry into ``db.news_tmp``.

        The response text is cut into per-article fragments on ``},{`` and the
        fields are sliced out by the positions of neighbouring key names, so the
        parsing depends on the key order and quoting of the server response.

        Returns:
            The last inserted document.
        """
        one_page = None
        for entry in content.split("},{"):
            url = "http://news.cqcoal.com/blank/nc.jsp?mid=" + entry[entry.find("id") + 4:entry.find("typeid") - 2]
            title = entry[entry.find("title") + 8:entry.find("shorttitle") - 3]
            timeStamp = entry[(entry.find("pubdate") + 10):(entry.find("senddate") - 3)]
            # int() raises ValueError on a malformed fragment; start_type() reports it.
            # NOTE(review): the date is rendered in UTC, as before — confirm that
            # this is the intended calendar day for the site.
            published = datetime.datetime.fromtimestamp(int(timeStamp), tz=datetime.timezone.utc)
            pubdate = published.strftime("%Y-%m-%d")
            one_page = {'title': title, 'url': url, 'date': pubdate, 'type': type, 'label': 'news.cqcoal.com'}
            db.news_tmp.insert_one(one_page)
        return one_page

    # (jsonStr payload, value stored in the 'type' field), listed in crawl order.
    categories = (
        ('{"typeid":"239"}', 'transportation'),
        ('{"typeid":"238"}', 'supply_and_demand'),
        ('{"typeid":"234"}', 'price'),
        ('{"typeid":"230"}', 'policy'),
        ('{"typeid":"236"}', 'international'),
        ('{"typeid":"232"}', 'expert'),
        # NOTE(review): this payload is not a {"typeid": ...} object like the
        # others and looks like a leftover placeholder; the correct id is not
        # known from this file, so the value is kept as-is — please verify.
        ('road_price}', 'economics'),
        ('{"typeid":"235"}', 'dynamic'),
        ('{"typeid":"237"}', 'correlation'),
        ('{"typeid":"14"}', 'comment'),
    )
    ###news.cqcoal.com
    for typeid, category in categories:
        start_type(cqcoal, 'news.cqcoal.com', typeid, num, category)
def cqcoal_body_start():
    """Fetch the article text for every stored news.cqcoal.com headline.

    For each category, the URLs saved in ``db.news_tmp`` are read back, the
    article endpoint is queried with the article id, and the extracted text is
    written to the ``newsbody`` and ``newsbody_1`` fields of the matching
    document. Failures are appended to the body error log.
    """
    def get_new_body(label, type):
        """Download and store the bodies of all articles of one category."""
        link_list = get_title_links_from_MongoDB(label, type)
        if label != 'news.cqcoal.com':
            return
        url = ''  # keeps the error message below well-defined if the loop never starts
        try:
            for url in link_list['url']:
                news_body, news_body_1 = get_news_Cqcoal_text(url,label, type)
                if news_body is not None:
                    # Collection.update() no longer exists in PyMongo 4; one
                    # update_one() sets both fields on the first matching document.
                    db.news_tmp.update_one(
                        {'url': url},
                        {"$set": {'newsbody': news_body, 'newsbody_1': news_body_1}},
                    )
            print("网站：" + label + "    类型：" + type + "内容爬取完毕!")
        except Exception:
            error = str(url + "   error:" + '    label:' + label + "    type：" + type)
            error_text_body(error, time_today)
            print(error)

    def get_news_Cqcoal_text(url,label, type):
        """Return ``(body, body)`` for one article URL, or ``(None, None)`` on failure.

        The article id is the part of ``url`` after the first ``=``. The text is
        sliced out of the raw response between the ``body`` and ``xh`` markers.
        """
        article_id = url.split('=')[1]
        # Keep the endpoint in its own name so that error messages still
        # identify the article that failed.
        endpoint = 'http://news.cqcoal.com/manage/newsaction.do?method:getNewsAddonarticle'
        post_param = {'id': article_id}
        error = str(url + "   error:" + '    label:' + label + "    type：" + type)
        try:
            response = requests.post(endpoint, data=post_param, verify=False, timeout=120)
        except requests.RequestException:
            print("error    label:", url, " Time out!")
            error_text_body(error, time_today)
            return None, None
        text = response.text
        start = text.find("body")
        end = text.find("xh")
        # str.find() returns -1 instead of raising, so a response without the
        # expected markers has to be detected explicitly.
        if start == -1 or end == -1:
            print("error    label:", url, "   type:", type)
            error_text_body(error, time_today)
            return None, None
        news_body = text[start + 7:end - 3]
        print(url + "  记录爬取完毕")
        # Both stored fields carry the same extracted text.
        return news_body, news_body

    ###news.cqcoal.com
    categories = (
        'transportation',
        'supply_and_demand',
        'price',
        'policy',
        'international',
        'expert',
        'economics',
        'dynamic',
        'correlation',
        'comment',
    )
    for category in categories:
        get_new_body('news.cqcoal.com', category)

JS 实例

JS页面解析
浏览器渲染引擎：渲染引擎的职责就是渲染，即在浏览器窗口中显示所请求的内容。浏览器向服务器发送请求，得到服务器返回的资源文件后，需要经过渲染引擎的处理，才能将资源文件显示在浏览器窗口中。

目前使用较为广泛的渲染引擎有两种：

WebKit——使用者有 Chrome、Safari；Gecko——使用者有 Firefox

渲染主流程：渲染引擎首先通过网络获得所请求文档的内容，通常以8K分块的方式完成。下面是渲染引擎在取得内容之后的基本流程：

解析html以构建dom树 -> 构建render树 -> 布局render树 -> 绘制render树
渲染引擎开始解析html，并将标签转化为内容树中的dom节点。
如果遇到JS，那么此时会启用单独连接进行下载，并且在下载后进行解析。
接着，它解析外部CSS文件及style标签中的样式信息。这些样式信息以及html中的可见性指令将被用来构建另一棵树——render树。
Render树由一些包含有颜色和大小等属性的矩形组成，它们将被按照正确的顺序显示到屏幕上。
Render树构建好了之后，将会执行布局过程，它将确定每个节点在屏幕上的确切坐标。
再下一步就是绘制，即遍历render树，并使用UI后端层绘制每个节点

渲染动态网页，有两种选择：

自己从头实现一个浏览器渲染引擎，在合适的时机返回构建的dom树或render树。
这需要进行大量的工作，需要考虑html、js、css等不同格式文件的解析方式以及解析顺序等。

完整代码实例

爬虫主程序

python">import warnings
warnings.filterwarnings("ignore")
import time
import requests
import pymongo
import pandas as pd
import re
from lxml import etree

# Connect to the local MongoDB server.
client = pymongo.MongoClient("localhost", 27017)
# Database that receives the crawled 36kr articles.
db = client["news"]
# Current local date (YYYY.MM.DD), evaluated once when the script starts.
today = time.strftime('%Y.%m.%d')

def main(page_num):
    """Crawl 36kr column listings and article bodies into MongoDB, then export a CSV.

    Steps:
      1. For each column, fetch listing pages 1..page_num from the
         search-column API and insert one document per article into ``db.kr36``.
      2. For each column, download the ``.json`` form of every article stamped
         with the module-level ``today`` and store its plain text as ``newsbody``.
      3. Write all 36kr documents to ``36氪首页news.csv``.

    Args:
        page_num: Number of listing pages per column (int or numeric string).
    """
    n = int(page_num)
    site = "36kr"
    # (column id used in the API path, column name stored as 'label'), in crawl order.
    columns = (
        ('23', '大公司'),
        ('221', '消费'),
        ('225', '娱乐'),
        ('218', '前沿技术'),
        ('219', '汽车交通'),
        ('208', '区块链'),
        ('103', '技能get'),
    )

    def start_crawler(pro,col,adress):
        """Fetch the listing pages of one column and store their entries."""
        for page in range(1, n + 1):
            t = time.time()
            # Page number and cache-busting timestamp travel in the query string;
            # no request body is sent with the GET.
            url = "https://36kr.com/api/search-column/{}?per_page=20&page={}&_=".format(pro,page)+str(int(t))
            time.sleep(2)  # pause between listing requests
            return_data = requests.get(url, verify = False)
            get_news_link_to_mongodb(return_data,col,url,adress)
        print (adress+" "+col+' 新闻标题抓取完毕')

    def get_news_link_to_mongodb(return_data,col,url,adress):
        """Insert one document per listing entry of the response into ``db.kr36``."""
        payload = return_data.json()  # parse the response once
        for item in payload.get('data').get('items'):
            one_page = {'title': item["title"],
                        'url_html': "https://36kr.com/p/"+str(item["id"])+".html",
                        'url_json': "https://36kr.com/p/"+str(item["id"])+".json",
                        'summary': item["summary"],
                        'tags': re.sub(r'[0-9"\[\]]','',item["extraction_tags"]).replace(",,"," ").replace(",",""),
                        'label': col,
                        'adress': adress,
                        'work_date': time.strftime('%Y.%m.%d')}
            db.kr36.insert_one(one_page)
        print("已经完成抓取 "+adress+" "+col+" 板块页面："+url)

    def news_body_start(label,adress,today):
        """Download the article text of every entry of one column stored on ``today``."""
        cursor = db.kr36.find({'label': label,'adress':adress,'work_date':today}, {'url_json': 1})
        # Collect the URLs before the loop, which updates the same collection.
        json_urls = [doc['url_json'] for doc in cursor]
        for json_url in json_urls:
            html = requests.get(json_url)
            a = html.json().get('props').get('detailArticle|post').get('content')
            sel = etree.HTML(a)
            # string(//*) yields the concatenated text content without markup.
            clear_content = sel.xpath('string(//*)')
            # Collection.update() no longer exists in PyMongo 4.
            db.kr36.update_one({'url_json': json_url}, {"$set": {'newsbody': clear_content}})
            print(json_url+"  抓取完毕")
        print (adress+" "+label+" "+"新闻主体抓取完毕")

    # Stage 1: listings of all columns; stage 2: article bodies of all columns.
    for column_id, column_name in columns:
        start_crawler(column_id, column_name, site)
    for _, column_name in columns:
        news_body_start(column_name, site, today)

    # Export
    list_ = list(db.kr36.find({'adress':'36kr'}))
    # The documents store the column name under 'label' (singular); asking for
    # a different column name produces an empty CSV column.
    ddf = pd.DataFrame(list_,columns=["title","url_html","tags","label","adress","summary","newsbody"])
    ddf.to_csv("36氪首页news.csv",encoding="utf_8_sig")

if __name__ == '__main__':
    # Script entry point: ask for the number of listing pages, then run the crawl.
    main(input("输入需要抓取的页数"))