python3爬虫例子01(获取个人博客园的粉丝)

news/2024/7/19 12:15:25 标签: 爬虫, python
#!/usr/bin/env python
# -*- coding:UTF-8 -*-

import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time


class GetFansName:
    """Scrape the follower ("fans") names of a cnblogs user into a text file.

    Workflow, as driven by the script section of this file:

    1. ``get_cookies`` opens Firefox with an existing profile and reads the
       browser cookies after loading the followers page.
    2. ``add_cookies`` copies those cookies into the ``requests`` session.
    3. ``get_fansNum`` reads the follower total and converts it to a page count.
    4. ``get_fansName`` fetches one page and appends its names to the file.

    Every public method is best-effort: failures are printed, not raised.
    """

    # Followers listed per page; the page count is derived from this value.
    FANS_PER_PAGE = 45

    def __init__(self, profiles, url, ses, sleepTime, fansNameFile):
        """Store the configuration values.

        Args:
            profiles: Path of the Firefox profile directory to load.
            url: Base URL of the user's home page (no trailing slash).
            ses: ``requests`` session used for all HTTP requests.
            sleepTime: Seconds to wait after the browser loads the page.
            fansNameFile: Path of the file the names are appended to.
        """
        self.profiles = profiles
        self.url = url
        self.ses = ses
        self.sleepTime = sleepTime
        self.fansNameFile = fansNameFile

    def get_cookies(self):
        """Return the cookies Firefox holds after loading the followers page.

        Returns:
            The list of cookie dicts reported by the driver, or ``None`` when
            the browser could not be started or the page could not be loaded.
        """
        driver = None
        try:
            # Load the existing browser profile.
            profiles = webdriver.FirefoxProfile(self.profiles)
            driver = webdriver.Firefox(profiles)
            driver.get(self.url + "/followers")
            # Give the page time to finish loading before reading cookies.
            time.sleep(self.sleepTime)
            return driver.get_cookies()
        except Exception as msg:
            print("get_cookies error:%s" % str(msg))
            return None
        finally:
            # Always shut the browser down, even when loading failed, so no
            # Firefox process is left behind.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as msg:
                    print("get_cookies error:%s" % str(msg))

    def add_cookies(self, cookies):
        """Copy browser cookies into the ``requests`` session.

        Args:
            cookies: Iterable of dicts with ``"name"`` and ``"value"`` keys, as
                returned by ``get_cookies``. ``None`` is reported and ignored.
        """
        if cookies is None:
            # get_cookies() failed earlier; there is nothing to copy.
            print("add_cookies error:no cookies supplied")
            return
        try:
            jar = requests.cookies.RequestsCookieJar()
            for cookie in cookies:
                jar.set(cookie["name"], cookie["value"])
            # Merge the collected cookies into the session.
            self.ses.cookies.update(jar)
        except Exception as msg:
            print("add_cookies error:%s" % str(msg))

    @staticmethod
    def _parse_fans_total(nav_text):
        """Return the follower count embedded in text like ``我的粉丝(123)``."""
        match = re.search(r"我的粉丝\((.+?)\)", nav_text)
        if match is None:
            raise ValueError("follower count not found in %r" % (nav_text,))
        return int(match.group(1))

    @classmethod
    def _page_count(cls, fans_total):
        """Return how many pages are needed for ``fans_total`` followers.

        Uses ceiling division so an exact multiple of the page size does not
        produce an extra, empty page. The result is never below 1.
        """
        pages = (fans_total + cls.FANS_PER_PAGE - 1) // cls.FANS_PER_PAGE
        return max(1, pages)

    def get_fansNum(self):
        """Return the number of follower pages to fetch.

        Returns:
            The page count derived from the follower total shown on the page,
            or ``1`` when the request or the parsing fails.
        """
        try:
            # Request the followers page.
            fansres = self.ses.get(self.url + "/relation/followers")
            fanssoup = BeautifulSoup(fansres.content, "html.parser")

            # The element with class "current_nav" carries the follower total.
            tempfansnum = fanssoup.find_all(class_="current_nav")
            fans_total = self._parse_fans_total(tempfansnum[0].string)
            print(u"我的粉丝数量:%s" % str(fans_total))

            fansnum = self._page_count(fans_total)
            print(u"总的分页:%s" % str(fansnum))
            return fansnum
        except Exception as msg:
            print("get_fansNum error:%s" % str(msg))
            return 1

    def get_fansName(self, fansnum):
        """Fetch one followers page and append its names to the output file.

        Args:
            fansnum: 1-based page number to fetch (despite the name, this is a
                page index, not a follower count). Values ``<= 1`` fetch the
                followers URL without a ``page`` parameter.
        """
        try:
            if fansnum <= 1:
                url_page = self.url + "/relation/followers"
            else:
                url_page = self.url + "/relation/followers?page=%s" % str(fansnum)

            print("正在抓取页面:%s" % url_page)

            # NOTE(review): TLS verification is disabled here but not in
            # get_fansNum; kept as in the original - confirm it is intended.
            fansnameres = self.ses.get(url_page, verify=False)
            fansnamesoup = BeautifulSoup(fansnameres.content, "html.parser")

            names = []
            for tag in fansnamesoup.find_all(class_="avatar_name"):
                raw = tag.string
                if raw is None:
                    # .string is None when the tag has several children; fall
                    # back to the concatenated text instead of aborting.
                    raw = tag.get_text()
                names.append(raw.replace("\n", " ").strip(" "))

            # Open the file once per page instead of once per name; skip it
            # entirely when the page listed nobody.
            if names:
                with open(self.fansNameFile, 'a', encoding="utf-8") as file:
                    file.writelines(name + "\n" for name in names)
        except Exception as msg:
            print("get_fansName error:%s" % str(msg))


if __name__ == '__main__':
    # Script entry point: log in through an existing Firefox profile, then
    # download every followers page and append the names to a text file.

    # Path of the Firefox profile directory.
    profiles = r"C:\Users\Administrator\AppData\Roaming\Mozilla\Firefox\Profiles\wv0f79j4.default"

    # Home-page URL of the user whose followers are collected.
    url = "https://home.cnblogs.com/u/NiceTime"

    # File the follower names are appended to.
    fansNameFile = "fansNameFile.txt"

    # Seconds to wait after the browser has opened the page.
    sleepTime = 5

    # One HTTP session is shared by all requests; the context manager closes
    # it (and its pooled connections) when the run ends, even on error.
    with requests.session() as ses:
        fansName = GetFansName(profiles, url, ses, sleepTime, fansNameFile)

        cookies = fansName.get_cookies()
        fansName.add_cookies(cookies)

        fansNums = fansName.get_fansNum()

        # Pages are numbered from 1.
        for fansNum in range(1, fansNums + 1):
            fansName.get_fansName(fansNum)



转载于:https://www.cnblogs.com/NiceTime/p/10070139.html


http://www.niftyadmin.cn/n/687769.html

相关文章

python3爬虫例子02(获取个人博客园的文章信息)

#!/usr/bin/env python# -*- coding:UTF-8 -*-import requestsfrom bs4 import BeautifulSoupresrequests.get("https://www.cnblogs.com/NiceTime/")# cres.contentcres.text# print(c)#获取文章日期soupBeautifulSoup(c,"html.parser")postdaysoup.find_…

[整理]一个有关Latch(锁存器)的有趣问题

起源 今天逛论坛,突然发现了一个有关latch的问题,由于对D Flip-Flop和Latch还有些疑问,就点击了进去,一看果然有些意思,也挺有学习意义的,于是本文就诞生了。喊出口号~Just note it. 有意思的问题图 两个问…

linux运维基础篇 unit10

10.系统日志1.系统日志默认分类/var/log/messages ##系统服务及日志,包括服务的信息,报错等等/var/log/secure ##系统认证信息日志/var/log/maillog ##系统邮件服务信息/var/log/cron ##系统…

php 批量插入mysql_PHP批量插入百万数据,php mysql pdo插入百万数据

最近做项目遇到一个小问题,php批量插入大量数据,如果使用for循环来进行插入无疑是很sb的行为,拼接成一条Sql语句,将很好的解决问题一、使用pdo进行数据插入set_time_limit(0);$dbmsmysql; //数据库类型$hostlocalhost; //数据…

python3爬虫03(find_all用法等)

#read1.html文件# <html><head><title>The Dormouses story</title></head># <body># <p class"title"><b>The Dormouses story</b></p>## <p class"story">Once upon a time there we…

希赛和51cto哪个好_软考哪个好?哪个更好考?答案在这!

希赛软考之家关注我们,顺利取证▲点击蓝字关注,获取更多资讯软考有初、中、高三个级别,每个级别又有多个资格考试项目,因此很多考生在报名的时候不知道到底该报考哪个级别的资格, 那么到底考哪个好呢?◆ ◆…

python基础-修改haproxy配置文件

需要掌握的知识: 1、函数 2、文件处理 3、tag的用法 4、程序的解耦 需求: 1:查询 2:添加 3:删除 4:修改 5:退出 haproxy.conf 配置文件内容: 1 global2 log 127.0.0.1 local…

python中常见单词意思_Python中经常使用的单词

您可以先定义一个函数来获取字符串中的所有k-mer:def get_all_k_mer(string, k1):length len(string)return [string[i: i k] for i in xrange(length-k1)]然后你可以使用collections.Counter来计算每个k-mer的重复次数:>>> from collections …