爬取网页
req = urllib.request.Request(url)
req.add_header('user-agent',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
response = urllib.request.urlopen(req)
html = response.read()
res = requests.get(img_url, headers = headers)
resp.text     # 返回的是一个经过解码后的字符串,是unicode类型
resp.content  # 返回的是一个原生字符串,是bytes类型
解析网页
soup = BeautifulSoup(html, 'html.parser')
imgs = soup.select('.slist')[0]
imgs1 = imgs.find_all('img')
imgs2 = imgs.find_all('a')
imag_urls = []
imag_titles = []
for img in imgs1:
imag_titles.append(img.get("alt"))
for img in imgs2:
imag_urls.append( s_url + img.get('href'))
return imag_titles, imag_urls
打印一下 soup 对象的内容,格式化输出
print(soup.prettify())
imgs = soup.select('.slist')[0]#获取class slist
imgs1 = imgs.find_all('img')#获取标签 img
imag_titles.append(img.get("alt"))#获取属性alt
下载文件
path = './out/qc_picture/'
if not os.path.exists(path):
os.makedirs(path)
res = requests.get(img_url, headers = headers)
if res.status_code == 200:
with open(path + name + str(page) + "页.jpg", "wb") as f:
f.write(res.content)
print(name + "-----下载完成")
注意事项
socket.setdefaulttimeout(1) #下载超过1s就跳过
res.close() #关闭请求
time.sleep(0.1)# 等待0.1s
实例
import urllib.request
import os
import requests
import time
import socket
from bs4 import BeautifulSoup
# Abort any connection/read that stalls for more than 1 second.
socket.setdefaulttimeout(1)
# Request headers; the Referer header defeats the site's hotlink protection.
headers = {'Referer':'https://www.mzitu.com','User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3679.0 Safari/537.36'}
url1 = "https://www.mzitu.com/xinggan"  # alternative listing URL (unused below)
url = "https://www.mzitu.com/mm"        # listing URL crawled by download()
def get_url(url):
    """Fetch *url* and return its body decoded as UTF-8.

    A browser User-Agent is set because some sites reject the default
    urllib agent.
    """
    req = urllib.request.Request(url)
    req.add_header('user-agent',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    # Use a context manager so the connection is closed even on error
    # (the original leaked the response object).
    with urllib.request.urlopen(req) as response:
        html = response.read()
    return html.decode("utf-8")  # the site serves UTF-8
def get_images(html):
    """Parse a listing page and return (titles, detail-page URLs)."""
    soup = BeautifulSoup(html, 'html.parser')
    post_list = soup.select('.postlist')[0]
    # Every <img> in the listing carries its entry's title in the alt attribute.
    imag_titles = [tag.get("alt") for tag in post_list.find_all('img')]
    # Each entry emits two <a> tags; only the odd-indexed ones (the second
    # of each pair) point at the detail page.
    imag_urls = [tag.get("href")
                 for idx, tag in enumerate(post_list.find_all('a'))
                 if idx % 2]
    return imag_titles, imag_urls
def get_img(url):
    """Return the src attribute of the main image on the detail page *url*."""
    page = BeautifulSoup(get_url(url), "html.parser")
    main_block = page.select('.main-image')[0]
    first_img = main_block.find_all("img")[0]
    return first_img.get("src")
def save_imgs(img_url, page, name):
    """Download *img_url* and save it as '<name><page>页.jpg' under ./out/qc_picture/.

    The module-level ``headers`` supply the Referer the site requires
    (hotlink protection). Sleeps briefly afterwards to throttle requests.
    """
    path = './out/qc_picture/'
    if not os.path.exists(path):
        os.makedirs(path)
    res = requests.get(img_url, headers = headers)
    try:
        if res.status_code == 200:
            # os.path.join instead of string concatenation for the file path.
            with open(os.path.join(path, name + str(page) + "页.jpg"), "wb") as f:
                f.write(res.content)
            print(name + "-----下载完成")
    finally:
        # Close the response even if writing raises (the original leaked it
        # on any exception before res.close()).
        res.close()
    time.sleep(0.1)  # small delay between downloads
def download(l, r):
    """Download every image from listing pages ``l``..``r`` (1-based, inclusive).

    Fixes vs. the original:
    - titles and URLs are iterated in lockstep with ``zip`` so a length
      mismatch between the two lists can no longer raise IndexError;
    - the printed total counts images actually downloaded, not just the
      titles found (the original summed ``len(list1)`` but looped over
      ``len(list2)``);
    - no longer shadows the builtin ``sum``.
    """
    total = 0
    for page in range(l, r + 1):
        page_url = url + "/page/" + str(page) + "/"
        titles, detail_urls = get_images(get_url(page_url))
        for title, detail_url in zip(titles, detail_urls):
            img_url = get_img(detail_url + "/1")
            save_imgs(img_url, page, title)
            total += 1
    print("一共下载" + str(total) + "张图片")
download(5, 5)