使用requests+parsel+pandas+mysql完成的一个爬虫示例

一个爬虫示例

目标网址：链家二手房

使用到的库

requests 网络请求
parsel 网页解析
pandas 数据处理和存储
pymysql、dbutils 数据持久化

其中MySQL存储用的是一个封装好的API类

传送门: mysql-client-tools

说明

数据的存储会有两种模式，一种是直接保存为excel，一种是保存到MySQL数据库
请通过 write_type 来控制模式 excel or mysql
为了方便测试，最后会提供简单的DDL创建语句

代码示例

python">from fake_useragent import UserAgent
import requests
import pandas as pd
import re
import time
import datetime
from parsel import Selector
from mysql_client_tools import MySQLClient


class HomeLinkSpider(object):

	write_type = "excel"

	def __init__(self, db_config):
		self.ua = UserAgent()
		self.headers = {"User-Agent": self.ua.random}
		self.data = list()
		self.path_name = "浦东_三房_500_800万"
		self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"
		self.mc = MySQLClient(**db_config)

	def get_max_page(self):
		response = requests.get(self.url, headers=self.headers)
		if response.status_code == 200:
			# 创建Selector类实例
			selector = Selector(response.text)
			# 采用css选择器获取最大页码div Boxl
			a = selector.css('div[class="page-box house-lst-page-box"]')
			# 使用eval将page-data的json字符串转化为字典格式
			max_page = eval(a[0].xpath('//@page-data').get())["totalPage"]
			print("最大页码数:{}".format(max_page))
			# return max_page
			return 3 # 测试时先用3页
		else:
			print("请求失败 status:{}".format(response.status_code))
			return None

	def parse_page(self):
		max_page = self.get_max_page()
		for i in range(1, max_page + 1):
			url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
			response = requests.get(url, headers=self.headers)
			selector = Selector(response.text)
			ul = selector.css('ul.sellListContent')[0]
			li_list = ul.css('li')
			for li in li_list:
				detail = dict()
				detail['title'] = li.css('div.title a::text').get()
				#  2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼
				house_info = li.css('div.houseInfo::text').get()
				house_info_list = house_info.split(" | ")

				detail['bedroom'] = house_info_list[0]
				detail['area'] = house_info_list[1]
				detail['direction'] = house_info_list[2]

				floor_pattern = re.compile(r'\d{1,2}')
				match1 = re.search(floor_pattern, house_info_list[4])  # 从字符串任意位置匹配
				if match1:
					detail['floor'] = str(match1.group())
				else:
					detail['floor'] = "未知"

				# 匹配年份
				year_pattern = re.compile(r'\d{4}')
				match2 = re.search(year_pattern, house_info_list[5])
				if match2:
					detail['year'] = str(match2.group())
				else:
					detail['year'] = "未知"

				# 文兰小区 - 塘桥    提取小区名和哈快
				position_info = li.css('div.positionInfo a::text').getall()
				detail['house'] = position_info[0]
				detail['location'] = position_info[1]

				# 650万，匹配650
				price_pattern = re.compile(r'\d+')
				total_price = li.css('div.totalPrice span::text').get()
				detail['total_price'] = str(re.search(price_pattern, total_price).group())

				# 单价64182元/平米， 匹配64182
				unit_price = li.css('div.unitPrice span::text').get()
				detail['unit_price'] = str(re.search(price_pattern, unit_price).group())

				# 图片
				pic_code = li.css('a.noresultRecommend ::attr(data-housecode)').get()
				pic_url = li.css('a.noresultRecommend img.lj-lazy ::attr(data-original)').get()
				detail['pic_code'] = pic_code
				detail['pic_url'] = pic_url
				# 把图片保存到本地
				# response = requests.get(pic_url)
				# pic_name = pic_code + '.jpg'
				# with open("./assets/" + pic_name, 'wb') as f:
				# 	f.write(response.content)

				if self.write_type == 'excel':
					self.data.append(detail)
				elif self.write_type == 'mysql':
					self.write_to_mysql(detail)
				else:
					print("No write type specified.")

	def write_to_file(self):
		if self.write_type != 'excel':
			return

		columns = ["标题", "房厅", "面积", "朝向", "楼层", "年份", "小区", "位置", "总价(万)", "单价(元/平方米)", "图像id", "图像地址"]
		keys = ["title", "house", "bedroom", "area","direction", "floor", "year", "location", "total_price", "unit_price", "pic_code", "pic_url"]
		try:
			print("start write to file...")
			# df = pd.DataFrame(data=[[]], columns=columns) # 二维数组形式
			df = pd.DataFrame(self.data) # 字典形式
			df.columns = columns
			t = datetime.datetime.now()
			# 请先安装 pip install pandas xlsxwriter
			writer =  pd.ExcelWriter(path=self.path_name + '_%02d%02d%02d.xlsx' % (t.hour, t.minute, t.second), engine="xlsxwriter")
			df.to_excel(writer, sheet_name='Sheet1', encoding='utf8', header=True, index=False, startcol=0, startrow=0)
			writer.save()
			# writer.close()
			print("Write a file to path name %s Successful." % self.path_name)
		except Exception as e:
			print("Fail to write file to path name: %s, Case: %s" % (self.path_name, e))

	def write_to_mysql(self, detail):
		table = "lian_jia_data"
		sql = '''select id from {} where pic_code=%s'''.format(table)
		result = self.mc.execute(sql, detail["pic_code"])
		if result:
			print("this data has exist! the pic code is ", detail["pic_code"])
			return
		self.mc.insert(table, detail)
		print("insert to mysql successful! data is ", detail)


if __name__ == '__main__':
	db_config = {
		"host": "127.0.0.1",
		"port": 3306,
		"user": "root",
		"password": "pwd",
		"database": "data",
	}
	start = time.time()
	home_link_spider = HomeLinkSpider(db_config=db_config)
	home_link_spider.write_type = "excel"
	home_link_spider.parse_page()
	home_link_spider.write_to_file()
	end = time.time()
	print("耗时：{}秒".format(end - start))

数据库DDL


CREATE TABLE `lian_jia_data` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(100) DEFAULT NULL,
  `house` varchar(100) DEFAULT NULL,
  `bedroom` varchar(100) DEFAULT NULL,
  `area` varchar(100) DEFAULT NULL,
  `direction` varchar(100) DEFAULT NULL,
  `floor` varchar(100) DEFAULT NULL,
  `year` varchar(100) DEFAULT NULL,
  `location` varchar(100) DEFAULT NULL,
  `total_price` varchar(100) DEFAULT NULL,
  `unit_price` varchar(100) DEFAULT NULL,
  `pic_code` varchar(100) DEFAULT NULL,
  `pic_url` varchar(100) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci