Python: scraping job listings from the major recruitment sites

2024/7/19 10:11:38  Tags: python, selenium, web scraping

Source code for scraping several major job boards with Python, kept here for my own reference.

1、Lagou (拉勾)

from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
from time import sleep
import re
# Search parameters (values are what Lagou expects)
what1 = '数据挖掘'  # keyword: data mining
what2 = '全职'      # job type: full-time
what3 = '北京'      # city: Beijing
what1 = urllib.parse.quote(what1)
what2 = urllib.parse.quote(what2)
what3 = urllib.parse.quote(what3)
driver = webdriver.PhantomJS()  # PhantomJS is deprecated; see the headless-Chrome note at the end
# driver=webdriver.Chrome(executable_path='E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
url = 'https://www.lagou.com/jobs/list_%s?px=default&gx=%s&city=%s#order' % (what1,what2,what3)
url2 = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=sug&fromSearch=true&suginput=shuju'
driver.implicitly_wait(100)
driver.get(url)
bs = BeautifulSoup(driver.page_source, 'html.parser')
req = bs.find('ul', class_='item_con_list', style='display: block;')
urllinks = req.find_all('a', class_='position_link')
import queue
que = queue.Queue()
for i in urllinks:
    print(i.get('href'))
    que.put(i.get('href'))
link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
link_next.click()
times = 0
while True:
    times += 1
    driver.implicitly_wait(10)
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    req = bs.find('ul', class_='item_con_list', style='display: block;')
    urllinks = req.find_all('a', class_='position_link')
    for i in urllinks:
        print(i.get('href'))
        que.put(i.get('href'))
    print(times)
    if times == 3:  # stop after the first few result pages
        break
    link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
    link_next.click()
    sleep(3)


driver2 = webdriver.PhantomJS()
# driver2=webdriver.Chrome(executable_path='E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
while not que.empty():  # a Queue object is always truthy; test emptiness instead
    try:
        newurl = que.get()
        driver2.get(newurl)
        driver2.implicitly_wait(100)
        bs2 = BeautifulSoup(driver2.page_source, 'html.parser')

        job_info = bs2.find('div', class_='job-name')
        company = job_info.find('div', class_='company')
        reg1 = re.compile("<[^>]*>")
        ### department / company line
        company = reg1.sub('', company.prettify())
        #### job title
        job = job_info.find('span', class_='name')
        reg2 = re.compile("<[^>]*>")
        job = reg2.sub('', job.prettify()).strip('\n')
        ### salary, location, experience, education
        job_req = bs2.find('dd', class_='job_request')
        all_info = []
        for i in job_req.find_all('span'):
            reg3 = re.compile("<[^>]*>")
            new_in = reg3.sub('', i.prettify())
            all_info.append(new_in)

        salary = all_info[0]
        mod = re.compile('/')
        salary = mod.sub('', salary).strip('\n')

        address = all_info[1]
        address = mod.sub('', address).strip('\n')
        exp = all_info[2]
        exp = mod.sub('', exp).strip('\n')
        edu = all_info[3]
        edu = mod.sub('', edu).strip('\n')
        ###job_detail
        job_det = bs2.find('dl', class_='job_detail', id='job_detail')
        ### job perks
        job_lu = job_det.find('dd', class_='job-advantage').find('p')
        reg4 = re.compile("<[^>]*>")
        job_lu = reg4.sub('', job_lu.prettify())
        ### responsibilities and requirements
        job_zong = job_det.find('dd', class_='job_bt')
        job_res = job_zong.find('div')
        reg5 = re.compile("<[^>]*>")
        job_res = str(reg5.sub('', job_res.prettify()).strip('\n').strip())
        ### work address
        job_ad = bs2.find('dd', class_='job-address clearfix').find('div', class_='work_addr')
        reg6 = re.compile("<[^>]*>")
        job_ad = reg6.sub('', job_ad.prettify()).strip('\n')
        job_con = bs2.find('dl', class_='job_company', id='job_company')
        ### company name
        com_name = job_con.find('dt').find('a').find('img').get('alt')
        ### company type
        com_cat = job_con.find('ul', class_='c_feature').find_all('li')
        all_info2 = []
        for i in com_cat:
            reg7 = re.compile("<[^>]*>")
            new_in = reg7.sub('', i.prettify())
            all_info2.append(new_in)
        com_cat = all_info2[0].strip('\n')
        # Field labels Lagou prepends; they are stripped from the values below
        lingyu = '领域'    # "domain"
        dev = '发展阶段'   # "growth stage"
        gui = '规模'       # "size"

        a1 = re.compile(lingyu)
        a2 = re.compile(dev)
        a3 = re.compile(gui)
        com_cat = a1.sub('',com_cat).strip()
        com_qua = all_info2[1].strip('\n')
        com_qua = a2.sub('',com_qua).strip()
        com_peo = all_info2[-2].strip('\n')
        com_peo = a3.sub('',com_peo).strip()
        db = pymysql.connect(host='localhost', user='root',
                             password='xin123456789', database='test',
                             charset='utf8')
        cursor = db.cursor()

        # Parameterized query, so quotes in the scraped text cannot break the SQL
        sql = ("INSERT INTO lagou_wajue (job_name,com_name,com_addr,com_cat,com_qua,com_peo,exp1,edu,salary,com_resp) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        cursor.execute(sql, (job, com_name, address, com_cat, com_qua, com_peo, exp, edu, salary, job_res))

        db.commit()
        cursor.close()
        db.close()
    except Exception as e:
        print('Failed to scrape this page:', e)


driver.close()
driver2.close()
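
Every scraper in this post repeats the same two chores: stripping HTML tags by running re.compile("<[^>]*>") over prettify() output, and opening a fresh PyMySQL connection for each record. A minimal sketch of shared helpers, if factoring them out is wanted (strip_tags and save_record are my own names, not part of the original scripts):

import re
import pymysql

TAG_RE = re.compile('<[^>]*>')

def strip_tags(node):
    # Same trick as above: pretty-print the node, then delete every tag.
    return TAG_RE.sub('', node.prettify()).strip()

def save_record(table, columns, values):
    # One connection per record, mirroring the original scripts; a single
    # long-lived connection would be the next improvement.
    db = pymysql.connect(host='localhost', user='root',
                         password='xin123456789', database='test',
                         charset='utf8')
    try:
        cursor = db.cursor()
        placeholders = ','.join(['%s'] * len(values))
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (
            table, ','.join(columns), placeholders)
        cursor.execute(sql, values)
        db.commit()
        cursor.close()
    finally:
        db.close()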

2、Liepin (猎聘)

2、1 Downloading the listing links

python">from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
from time import sleep
what1 = '数据分析'  # search keyword: data analysis
what1 = urllib.parse.quote(what1)
driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
for page in range(5):  # iterate over the page index directly; the inner loop below reuses its own variable
    url = 'https://www.liepin.com/zhaopin/?pubTime=&ckid=5ac323b614701474&fromSearchBtn=2&compkind=&isAnalysis=&init=-1&searchType=1&dqs=070020&industryType=&jobKind=&sortFlag=15&degradeFlag=0&industries=&salary=&compscale=&key=%s&clean_condition=&headckid=5ac323b614701474&curPage=%d' % (what1, page)
    # url = 'https://www.liepin.com/bj/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&key=%s' % what1

    # driver=webdriver.PhantomJS()
    driver.get(url)
    driver.implicitly_wait(100)

    links = driver.find_elements_by_xpath("//div[@class='job-info']/h3")
    w = open('e:/myurl2.txt', 'a', encoding='utf-8')
    for link in links:
        final = link.find_element_by_xpath("./a")
        print(final.get_attribute('href'))
        w.writelines(final.get_attribute('href') + '\n')

    w.close()
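
These scripts lean on implicitly_wait, which sets one global timeout for every element lookup. When a page renders its list late, an explicit wait on the exact element is more predictable. A sketch using Selenium's standard support helpers (the 10-second timeout is an arbitrary choice):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 10 s until the job list is actually in the DOM, then scrape.
WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, "//div[@class='job-info']/h3")))
links = driver.find_elements(By.XPATH, "//div[@class='job-info']/h3")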

2、2 Scraping the details

python">from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
import re
from time import sleep
import threading
from threading import current_thread,Lock
import multiprocessing

import queue
# Thin Thread wrapper; unused in the single-threaded run below
# (a parallel sketch follows this section).
class MyThread(threading.Thread):
    def __init__(self, funcs, args, name=''):
        threading.Thread.__init__(self)
        self.funcs = funcs
        self.name = name
        self.args = args

    def run(self):
        self.funcs(*self.args)
def getcontent(que, driver):
    while not que.empty():  # a Queue object is always truthy; test emptiness instead
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')

            job_info = bs2.find('div', class_='title-info')
            company = job_info.find('h3').find('a')
            reg1 = re.compile("<[^>]*>")
            ### company
            company = reg1.sub('', company.prettify()).strip('\n').strip()
            print(company)
            #### job title
            job = job_info.find('h1')
            reg2 = re.compile("<[^>]*>")
            job = reg2.sub('', job.prettify()).strip('\n')
            print(job)
            ### salary, location, experience, education
            job_req = bs2.find('div', class_='job-title-left')
            salary = job_req.p.contents[0].strip()
            com_addr = job_req.find('p', class_='basic-infor').find('span').find('a').text
            qua = job_req.find('div', class_='job-qualifications')
            need = []
            for i in qua.find_all('span'):
                need.append(i.text)
            edu = need[0]
            exps = need[1]
            print(edu)
            print(exps)
            print(com_addr)
            print(salary)
            response = bs2.find('div', class_='job-item main-message').find('div', class_='content content-word')
            reg3 = re.compile("<[^>]*>")
            job_res = reg3.sub('', response.prettify()).strip('\n').strip()
            print(job_res)
            com_info = bs2.find('div', class_='company-infor').find('ul').find_all('li')
            infom = []
            for i in com_info:
                infom.append(i.text)
                print(i.text)
            com_cat = infom[0].strip('\n').strip()
            com_peo = infom[1]
            com_qua = infom[2]
            sleep(1)
            db = pymysql.connect(host='localhost', user='root',
                                 password='xin123456789', database='test',
                                 charset='utf8')
            cursor = db.cursor()

            sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,com_peo,exp1,edu,salary,com_resp) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res))

            db.commit()
            cursor.close()
            db.close()
        except Exception as e:
            print('Error processing page:', e)


def main():
    w = open('e:/myurl2.txt', 'r', encoding='utf-8')
    urls = []
    for i in w.readlines():
        newline = i.strip()
        urls.append(newline)
    w.close()
    print(len(urls))

    que = queue.Queue()
    for i in urls:
        que.put(i)
    # driver = webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que,driver)




if __name__ == '__main__':
    main()
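
MyThread is defined in each of these scripts but never instantiated. Since queue.Queue is thread-safe, the class could split the URL queue across several browsers. A sketch of that idea (the two-worker count and the reuse of the chromedriver path are my assumptions; with multiple workers, a non-blocking que.get(block=False) inside getcontent would avoid the rare race between empty() and get()):

# Two workers draining the same thread-safe queue in parallel.
drivers = [webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
           for _ in range(2)]
threads = [MyThread(getcontent, (que, d), 'worker-%d' % n)
           for n, d in enumerate(drivers)]
for t in threads:
    t.start()
for t in threads:
    t.join()
for d in drivers:
    d.quit()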

3、51job (前程无忧)

python">from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
import re
from time import sleep
import threading
from threading import current_thread,Lock
import multiprocessing


import queue
class MyThread(threading.Thread):
    def __init__(self, funcs, args, name=''):
        threading.Thread.__init__(self)
        self.funcs = funcs
        self.name = name
        self.args = args

    def run(self):
        self.funcs(*self.args)
def getcontent(que, driver):
    while not que.empty():  # a Queue object is always truthy; test emptiness instead
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='cn')
            company = job_info.find('p',class_='cname').find('a')

            ### company
            company = company.get('title')

            #### job title
            job = job_info.find('h1')

            job = job.get('title')

            ### salary, location, experience, education
            com_addr = job_info.find('span',class_='lname').text
            salary = job_info.find('strong').text
            com_all= job_info.find('p',class_='msg ltype').text.strip('\t').strip('\n').split('|')
            com_qua = com_all[0].strip('\n').strip()
            com_peo = com_all[1].strip('\n').strip()
            com_cat = com_all[2].strip('\n').strip()
            print(com_qua)

            job_main = bs2.find('div',class_= 'tCompany_main')
            info_all = []
            for i in job_main.find_all('span',class_='sp4'):
                info_all.append(i.text)
            exps = info_all[0].strip()
            edu = info_all[1].strip()
            if '经验' not in exps:  # '经验' = "years of experience"
                exps = None
            # education levels: junior high and below / senior high or technical
            # school / associate / bachelor / master / doctorate
            if edu not in ['初中及以下', '高中/中技/中专', '大专', '本科', '硕士', '博士']:
                edu = None
            job_res = job_main.find('div',class_='bmsg job_msg inbox')
            reg3 = re.compile("<[^>]*>")
            job_res = reg3.sub('', job_res.prettify()).strip('\n').strip()
            reg4 = re.compile('分享')  # strip the "share" widget text
            reg5 = re.compile('举报')  # strip the "report" link text
            job_res = reg4.sub('', job_res).strip('\n').strip()
            job_res = reg5.sub('', job_res).strip('\n').strip()

            db = pymysql.connect(host='localhost', user='root',
                                 password='xin123456789', database='test',
                                 charset='utf8')
            cursor = db.cursor()

            sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,com_peo,exp1,edu,salary,com_resp) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res))

            db.commit()
            cursor.close()
            db.close()
        except Exception as e:
            print('Error processing page:', e)


def main():
    w = open('e:/myurl10.txt', 'r', encoding='utf-8')
    urls = []
    for i in w.readlines():
        newline = i.strip()
        urls.append(newline)
    w.close()
    print(len(urls))

    que = queue.Queue()
    for i in urls:
        que.put(i)
    # driver = webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que,driver)




if __name__ == '__main__':
    main()
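
One side effect of the parameterized insert used above: when the validation sets exps or edu to None, PyMySQL stores SQL NULL rather than the literal string 'None' that old-style %-formatting would have produced. Illustration against a hypothetical table:

# demo_jobs is a made-up table; only the None-to-NULL mapping matters here.
cursor.execute(
    "INSERT INTO demo_jobs (job_name, edu) VALUES (%s, %s)",
    ('数据分析师', None))  # edu arrives in MySQL as NULL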

4、ChinaHR (中华英才)

python">from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
import re
from time import sleep
import threading
from threading import current_thread,Lock
import multiprocessing


import queue
class MyThread(threading.Thread):
    def __init__(self, funcs, args, name=''):
        threading.Thread.__init__(self)
        self.funcs = funcs
        self.name = name
        self.args = args

    def run(self):
        self.funcs(*self.args)
def getcontent(que, driver):
    while not que.empty():  # a Queue object is always truthy; test emptiness instead
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='base_info')
            #### job title
            job = job_info.find('div').find('h1').find('span').text

            ### salary, location, experience, education

            min_info = job_info.find('div',class_='job_require')
            all_in = []
            for i in min_info.find_all('span'):
                print(i.text)
                all_in.append(i.text)
            print(all_in)
            salary = all_in[0].strip()
            com_addr = all_in[1].strip()
            edu = all_in[3].strip()
            exps = all_in[4].strip()
            job_main = bs2.find('div',class_= 'job_intro_wrap')

            job_res = job_main.find('div',class_='job_intro_info')
            reg3 = re.compile("<[^>]*>")
            job_res = reg3.sub('', job_res.prettify()).strip('\n').strip()
            com_intro = bs2.find('div',class_='job-company jrpadding')

            company = com_intro.find('h4').find('a').text.strip()
            print(company)
            com_info = com_intro.find('tbody').find_all('tr')

            com_s = []
            for i in com_info:
                times = 0
                for j in i.find_all('td'):
                    times += 1
                    if times == 2:  # the second <td> holds the field value
                        com_s.append(j.text)
            com_cat = com_s[0].strip()
            com_qua = com_s[2].strip()
            com_peo = com_s[1].strip()
            print(job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res)
            sleep(1)
            db = pymysql.connect(host='localhost', user='root',
                                 password='xin123456789', database='test',
                                 charset='utf8')
            cursor = db.cursor()
            sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,com_peo,exp1,edu,salary,com_resp) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res))

            db.commit()
            cursor.close()
            db.close()
        except Exception as e:
            print('Error processing page:', e)


def main():
    w = open('e:/myurl8.txt', 'r', encoding='utf-8')
    urls = []
    for i in w.readlines():
        newline = i.strip()
        urls.append(newline)
    w.close()
    print(len(urls))

    que = queue.Queue()
    for i in urls:
        que.put(i)
    driver = webdriver.PhantomJS()  # deprecated; see the headless-Chrome note below
    # driver = webdriver.Chrome(executable_path='E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que,driver)


if __name__ == '__main__':
    main()
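
PhantomJS, used for the Lagou and ChinaHR runs above, has been unmaintained since 2018, and newer Selenium releases removed support for it. Headless Chrome is a near drop-in replacement; a sketch reusing the chromedriver path from these scripts (the options keyword needs Selenium 3.8+):

from selenium import webdriver

opts = webdriver.ChromeOptions()
opts.add_argument('--headless')  # no visible browser window, like PhantomJS
driver = webdriver.Chrome(
    executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe',
    options=opts)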
