参考:
Python threading实现多线程 原理 基础篇
python 多线程爬取网站图片(详解)
【Python】threading控制线程的数量
1. 读取url列表,统计文件夹中已存在的文件个数,得到尚未爬取的url列表
2. 多线程爬取、存储
不固定线程数量
import time
import datetime
import os
import requests
import threading
import pandas as pd
from tqdm import tqdm
from queue import Queue
# Work queue holding the URLs that still have to be downloaded.
url_queue = Queue()
pic_folder = './crawl_pic/'
pic_file = pic_folder + 'pic.csv'
error_pic_file = pic_folder + 'pic_error.txt'
# Read the full URL list from the 'url' column of the CSV file.
pic_url = pd.read_csv(pic_file)
pic_url = pic_url['url'].values
# The CSV and the error log live in the same folder as the downloaded
# pictures; they must not be counted as pictures, otherwise the resume
# offset below is too large and URLs that were never fetched get skipped.
_bookkeeping_files = {os.path.basename(pic_file), os.path.basename(error_pic_file)}
files = [name for name in os.listdir(pic_folder) if name not in _bookkeeping_files]
# Number of pictures already present; used as the resume offset into pic_url.
num_png = len(files)
def download_pic():
    '''
    Take one URL from ``url_queue``, download it and store it in ``pic_folder``.

    The picture is saved under the last path component of the URL (query
    string and fragment removed). If the request fails, the URL is appended
    to ``error_pic_file`` and the function returns normally so the remaining
    URLs can still be processed. ``url_queue.task_done()`` is always called,
    so ``url_queue.join()`` cannot block on a failed download.

    :return: None
    '''
    url = url_queue.get()
    try:
        # A timeout keeps one stalled connection from blocking
        # url_queue.join() forever.
        response = requests.get(url, timeout=30)
        # Treat HTTP error statuses as failures instead of saving the
        # error page as if it were a picture.
        response.raise_for_status()
        # The response body is the raw binary content of the picture.
        img = response.content
        # Throttle requests to the remote server.
        time.sleep(3)
    except Exception:
        print("--------出错继续----")
        with open(error_pic_file, 'a+', encoding='utf-8') as f:
            f.write(url + '\n')
    else:
        # Derive the target file name from the URL; the original code used
        # an undefined name ``path`` here and raised NameError.
        file_name = os.path.basename(
            url.split('?', 1)[0].split('#', 1)[0].rstrip('/'))
        path = os.path.join(pic_folder, file_name)
        with open(path, 'wb') as f:
            f.write(img)
    finally:
        url_queue.task_done()
# Enqueue every URL that has not been downloaded yet.
for url in pic_url[num_png:]:
    url_queue.put(url)
# Start one worker thread per queued URL; each worker handles exactly one URL.
for url in pic_url[num_png:]:
    # Pass the function object itself. Writing ``download_pic()`` would run
    # the download in the main thread and hand ``None`` to Thread as target.
    t_url = threading.Thread(target=download_pic)
    t_url.start()
# Block until task_done() has been called for every queued URL.
url_queue.join()
固定线程数量
import time
import datetime
import os
import requests
import threading
import pandas as pd
from tqdm import tqdm
from queue import Queue
# Queue object (created here but not used by the fixed-thread-count variant below).
url_queue = Queue()
pic_folder = './crawl_pic/'
pic_file = pic_folder + 'pic.csv'
error_pic_file = pic_folder + 'pic_error.txt'
# Read the full URL list from the 'url' column of the CSV file.
pic_url = pd.read_csv(pic_file)
pic_url = pic_url['url'].values
# The CSV and the error log live in the same folder as the downloaded
# pictures; they must not be counted as pictures, otherwise the resume
# offset below is too large and URLs that were never fetched get skipped.
_bookkeeping_files = {os.path.basename(pic_file), os.path.basename(error_pic_file)}
files = [name for name in os.listdir(pic_folder) if name not in _bookkeeping_files]
# Number of pictures already present; used as the resume offset into pic_url.
num_png = len(files)
def download_pic(url):
    '''
    Download one picture and store it in ``pic_folder``.

    The module-level semaphore ``pool_sema`` is held for the whole download,
    which limits how many downloads run at the same time. The picture is
    saved under the last path component of the URL (query string and
    fragment removed). If the request fails, the URL is appended to
    ``error_pic_file`` and the function returns normally.

    :param url: address of the picture to download
    :return: None
    '''
    # ``with`` acquires the semaphore and guarantees it is released again,
    # even if writing the file raises.
    with pool_sema:
        try:
            # A timeout keeps a stalled connection from holding a
            # semaphore slot forever.
            response = requests.get(url, timeout=30)
            # Treat HTTP error statuses as failures instead of saving the
            # error page as if it were a picture.
            response.raise_for_status()
            # The response body is the raw binary content of the picture.
            img = response.content
            # Throttle requests to the remote server.
            time.sleep(3)
        except Exception:
            print("--------出错继续----")
            with open(error_pic_file, 'a+', encoding='utf-8') as f:
                f.write(url + '\n')
        else:
            # Derive the target file name from the URL; the original code
            # used an undefined name ``path`` here and raised NameError.
            file_name = os.path.basename(
                url.split('?', 1)[0].split('#', 1)[0].rstrip('/'))
            path = os.path.join(pic_folder, file_name)
            with open(path, 'wb') as f:
                f.write(img)
# Upper bound on the number of downloads allowed to run concurrently.
max_connections = 2
# download_pic acquires this semaphore, so at most max_connections
# downloads are in flight at any moment.
pool_sema = threading.BoundedSemaphore(value=max_connections)

# Build one (not yet started) thread for every URL that is still pending.
thread_list = [
    threading.Thread(target=download_pic, args=(url,))
    for url in pic_url[num_png:]
]

# Launch all threads first ...
for t in thread_list:
    t.start()

# ... then make the main thread wait until every one of them has finished.
for t in thread_list:
    t.join()