获取肖像数据集
- 前言
- 代码
前言
在《基于人脸识别和图像分割技术制作证件照》博文中介绍了如何使用deeplab图像分割模型制作证件照,在《基于PASCAL VOC 2012数据集训练deeplab图像分割模型》博文中介绍了如何调优模型,在《使用labelme工具标注人像数据集》博文中介绍了如何标注人像数据集,今天教大家获取肖像数据。
代码
详情参考github
import requests
import urllib
from lxml import etree, html
import time
import os
def crawl_person_image_url(page: int = 1, header=None):
    """Scrape one page of vcg.com's portrait (xiaoxiang) search listing.

    Args:
        page: 1-based page number of the listing to fetch.
        header: optional dict of HTTP headers (e.g. a User-Agent) forwarded to
            ``requests.get``; ``None`` uses the requests defaults. The original
            default referenced an undefined global ``header`` and crashed at
            import time.

    Returns:
        A list of ``(full_image_url, thumbnail_url)`` tuples built from each
        figure's ``data-src`` / ``data-min`` attributes, with the ``https:``
        scheme prepended (the page serves protocol-relative URLs).
        Figures missing the <img> element or either attribute are skipped.
    """
    url = f'https://www.vcg.com/creative-image/xiaoxiang/?creativeRace=1&page={page}'
    req: requests.Response = requests.get(url=url, headers=header, timeout=30)
    root = etree.HTML(req.text)
    # NOTE(review): the original referenced an undefined ``xpath`` variable;
    # '//figure' matches the listing's result items — confirm against the
    # current page markup.
    figures = root.xpath('//figure')
    figure_url_list = []
    for fg in figures:
        try:
            img = fg.xpath('a/img')[0]  # first image element inside the figure
            data = ('https:' + img.get('data-src'), 'https:' + img.get('data-min'))
            figure_url_list.append(data)
        except (IndexError, TypeError):
            # IndexError: no <img> under the figure.
            # TypeError: a missing data-* attribute (``get`` returned None).
            # The original ``except e:`` raised NameError instead of handling.
            continue
    return figure_url_list
def download_image(url: str, save_path='./vcg/'):
    """Download one image to *save_path* and return the local file path.

    Args:
        url: direct image URL; the final path segment becomes the file name.
        save_path: destination directory, created if it does not exist.

    Returns:
        The full path of the written file.

    Raises:
        requests.HTTPError: if the server responds with an error status —
            the original silently saved error pages to disk as images.
    """
    os.makedirs(save_path, exist_ok=True)
    image_name = url.split('/')[-1]
    # os.path.join also copes with save_path values lacking a trailing slash,
    # unlike the original string concatenation.
    full_save_file_name = os.path.join(save_path, image_name)
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    with open(full_save_file_name, 'wb') as f:
        f.write(resp.content)
    return full_save_file_name
def main(max_page: int = 10):
    """Crawl portrait listing pages 1..max_page and download every image.

    Args:
        max_page: number of listing pages to crawl. The original body
            referenced an undefined ``MAX_PAGE`` constant; it is now a
            defaulted parameter so ``main()`` still works with no arguments.
    """
    for page in range(1, max_page + 1):
        # Only the full-size URL (index 0) is downloaded; the thumbnail
        # URL (index 1) is ignored, as in the original.
        for full_url, _thumb_url in crawl_person_image_url(page):
            download_image(full_url)
            time.sleep(0.1)  # throttle requests to avoid hammering the server