# 为什么80%的码农都做不了架构师?>>>
'''
用lxml取文本内容
string(.):"取所有文本"
split():"不给参数,默认按空格切割"
join():"将列表转为字符串"
'''
from requests_html import HTMLSession
import requests
from simple_spider_rule.get_charset import pick_charset
from lxml import etree
# Impersonate a desktop Chrome browser so the news site serves normal pages.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/"
        "537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
    ),
}
class RequLxml(object):
    """Scrape a china.com news article and print title / time / body text.

    Fetches the page with ``requests`` and extracts fields via lxml XPath.
    ``re_render`` is an alternative fetcher based on ``requests_html`` for
    pages that need JavaScript rendering.
    """

    def __init__(self, url):
        # Target article URL.
        self.url = url

    def re_render(self):
        """Fetch the page via requests_html; return ``res.html`` or None on failure."""
        try:
            with HTMLSession() as sess:
                res = sess.get(url=self.url, headers=headers)
                return res.html
        except Exception as e:
            # Best-effort: report and return None rather than crash the caller.
            print("请求出错,一个错误的网站", e)
            return None

    def res_quest(self):
        """Plain requests GET; returns the Response, or None on any error."""
        try:
            res = requests.get(url=self.url, headers=headers)
            return res
        except Exception as e:
            print(e)
            return None

    def get_return_data(self):
        """Download the page and return its decoded HTML text (None on failure).

        A charset declared in the page's own <meta> tag overrides the encoding
        requests guessed from the HTTP headers, so Chinese text decodes correctly.
        """
        res = self.res_quest()
        if res is None:
            # Request failed: propagate None instead of crashing on res.text.
            return None
        charset = pick_charset(res.text)
        if charset:
            res.encoding = charset
        return res.text

    def lxml_data(self):
        """Parse the article page and print title, publish time and body content."""
        data = self.get_return_data()
        if data is None:
            return
        selector_cdata = etree.HTML(data)
        title_nodes = selector_cdata.xpath("//h1[@id='chan_newsTitle']")
        content_nodes = selector_cdata.xpath("//div[@id='chan_newsDetail']")
        info_nodes = selector_cdata.xpath("//div[@id='chan_newsInfo']")
        if not (title_nodes and content_nodes and info_nodes):
            # Page layout changed, or we did not get the expected article page;
            # bail out instead of raising IndexError on an empty xpath result.
            return
        title = title_nodes[0].text
        # string(.) concatenates all descendant text; split() then drops every
        # whitespace run so the body can be re-joined with single spaces.
        content = content_nodes[0].xpath("string(.)").split()
        # Tokens 5..6 of the info line hold the publish date and time —
        # assumes the site's fixed "chan_newsInfo" layout; TODO confirm.
        ctime = info_nodes[0].xpath("string(.)").split()[5:7]
        print("title:", title)
        print("ctime:", " ".join(ctime))
        print("content:", " ".join(content))

    def main(self):
        """Entry point: run the whole scrape-and-print pipeline."""
        self.lxml_data()
if __name__ == '__main__':
    # Demo run against a sample article.
    url = "https://news.china.com/socialgd/10000169/20190124/35071107.html"
    RequLxml(url=url).main()
import re
def pick_charset(html):
    """Extract the charset declared in a <meta> tag of an HTML document.

    Handles both the HTML5 form (``<meta charset=...>``) and the legacy form
    (``<meta http-equiv="Content-Type" content="...; charset=...">``).

    :param html: raw HTML text of any web page (may be None or empty)
    :return: lower-cased charset name, or None when no charset is declared
    """
    if not html:
        # Original crashed with TypeError on None; be tolerant instead.
        return None
    # Raw string for the regex; the charset capture group is mandatory in the
    # pattern, so a successful match always carries group(2) — the old
    # ``m.lastindex == 2`` check was redundant.
    pattern = r'<meta .*(http-equiv="?Content-Type"?.*)?charset="?([a-zA-Z0-9_-]+)"?'
    m = re.search(pattern, html, re.I)
    if m:
        return m.group(2).lower()
    return None
'''
用dom树取文本内容
'''
from requests_html import HTMLSession
import requests
from simple_spider_rule.get_charset import pick_charset
from pyquery import PyQuery as pq
# Browser-like User-Agent; without it the site may block or alter responses.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/"
                  "537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
}
class RequLxml(object):
    """Scrape a china.com news article and print its fields using PyQuery.

    Same pipeline as the lxml variant, but extraction goes through CSS id
    selectors on a PyQuery DOM instead of XPath.
    """

    def __init__(self, url):
        # Target article URL.
        self.url = url

    def re_render(self):
        """Fetch the page via requests_html; return ``res.html`` or None on failure."""
        try:
            with HTMLSession() as sess:
                res = sess.get(url=self.url, headers=headers)
                return res.html
        except Exception as e:
            print(e)
            return None

    def res_quest(self):
        """Plain requests GET; returns the Response, or None on any error."""
        try:
            res = requests.get(url=self.url, headers=headers)
            return res
        except Exception as e:
            print(e)
            return None

    def get_return_data(self):
        """Download the page and return its decoded HTML text (None on failure).

        A charset declared in the page's own <meta> tag overrides the encoding
        requests guessed from the HTTP headers.
        """
        res = self.res_quest()
        if res is None:
            # Request failed: propagate None instead of crashing on res.text.
            return None
        charset = pick_charset(res.text)
        if charset:
            res.encoding = charset
        return res.text

    def handle_space(self, parameter):
        """Collapse every whitespace run in *parameter* into a single space."""
        return " ".join(parameter.split())

    def get_data(self):
        """Extract title / publish-time / body by element id and print them."""
        res = self.get_return_data()
        if res is None:
            return
        dom = pq(res)
        title = dom("#chan_newsTitle").text()
        ctime = dom("#chan_newsInfo").text()
        content = dom("#chan_newsDetail").text()
        print("title:", self.handle_space(title))
        print("ctime:", self.handle_space(ctime))
        print("content:", self.handle_space(content))

    def run(self):
        """Entry point: run the whole scrape-and-print pipeline."""
        self.get_data()
if __name__ == '__main__':
    # Demo run against a sample article.
    url = "https://news.china.com/socialgd/10000169/20190124/35071107.html"
    RequLxml(url=url).run()
'''
yeild
注:python3和python2 的生成器用法不同了,python3用的是__next__(),python2用的是next()
'''
import queue
class Yeild_Queue(object):
    """Demo: drain a ``queue.Queue`` through a generator.

    NOTE: Python 3 generators expose ``__next__()`` (Python 2 used ``next()``);
    the portable spelling is the built-in ``next(gen)`` — or simply a for-loop.
    (Class name keeps its original typo: it is the public interface.)
    """

    def __init__(self, clist: list):
        self.q = queue.Queue()
        # Items to push through the queue, in order.
        self.clist = clist

    def save_data(self):
        """Put every item of ``clist`` into the queue; returns the queue."""
        for data in self.clist:
            self.q.put(data)
        return self.q

    def get_data(self):
        """Generator: fill the queue, then yield its items in FIFO order."""
        self.save_data()
        while not self.q.empty():
            yield self.q.get()

    def run(self):
        """Print every queued item.

        Iterating the generator directly replaces the fragile
        ``range(len(...))`` + ``__next__()`` pairing of the original and
        works for any input length, including empty.
        """
        for item in self.get_data():
            print(item)
if __name__ == '__main__':
    # Quick demo: items come back out in insertion (FIFO) order.
    sample = [3, 6, 1, 9, 4, 2]
    Yeild_Queue(sample).run()