Simple Java Crawler for 51Jobs

2024/7/19 10:00:48  Tags: java, crawler, javascript

This crawler uses Jsoup, an HTML parser that can parse a URL or a local HTML file directly. It can also extract and manipulate data through DOM traversal, CSS selectors, and jQuery-like methods.

Jsoup official documentation: https://jsoup.org/cookbook/introduction/parsing-a-document
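A minimal sketch of that parse-and-select workflow, assuming a generic page (the URL and selector below are placeholders, not taken from this article):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupQuickStart {
    public static void main(String[] args) throws Exception {
        //Fetch and parse a page directly from a URL
        Document doc = Jsoup.connect("https://example.com").get();
        //CSS selectors work much like jQuery: pick every link on the page
        Elements links = doc.select("a[href]");
        for (Element link : links) {
            System.out.println(link.attr("href") + " -> " + link.text());
        }
    }
}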

Note: if the output comes out garbled, check the page's character encoding and decode with that same encoding. When submitting Chinese text through a form, some sites require the value to be URL-encoded before the Chinese characters are transmitted correctly.
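A short sketch of that URL-encoding step (the keyword string is only an example; GBK is used here because the 51job pages targeted below are GBK-encoded):

import java.net.URLEncoder;

public class EncodeKeywordDemo {
    public static void main(String[] args) throws Exception {
        //Percent-encode the Chinese keyword with the target page's charset (GBK here),
        //otherwise the form value arrives garbled on the server side
        String keyword = URLEncoder.encode("数据分析", "gbk");
        System.out.println(keyword);
    }
}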

The main code is as follows:

package com.galoliy.spider.maven_spider.domain;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Cat5jobs {

    public Document getResultPage(String url,String keyword) throws UnsupportedEncodingException {
        Document doc = null;
        
        //The multipart/form-data value must be URL-encoded first,
        //otherwise the keyword in the POST body ends up garbled.
        keyword = URLEncoder.encode(keyword, "gbk");
        
        try {
            
            //Fetch the index page
            Response resp = Jsoup.connect(url).method(Method.GET).execute();
            doc = resp.parse();
        
            //Get the form action URL that leads to the query results page
            String actionPath = doc.select("form").attr("action");
            
             Connection con = Jsoup.connect(actionPath)
                    .data("keyword", keyword)
                    .userAgent("Mozilla")
                    .cookies(resp.cookies())
                    .header("Accept-Language", "zh-CN,zh;q=0.9")
                    .timeout(300000);
             //Fetch the query results page
            doc = con.method(Method.POST).execute().parse();
            
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }
    

    public void getResult(String url,String keyword,String dir,String fileName) {

        Document doc = null;
        File htmlPath = null;
        File txtPath = null;
        String htmlFilePath = dir + fileName + ".htm";
        String txtFilePath = dir + fileName + "2.txt";
        txtPath = new File(txtFilePath);
        htmlPath = new File(htmlFilePath);
        Map map = null;
        String printSrc = "";
        
        try {
            //If an HTML file already exists locally, parse it, print the contents, and save them to a txt file.
            if(!txtPath.exists() && htmlPath.exists()) { 
                
                doc = Jsoup.parse(htmlPath, "utf-8");
                
                if(!doc.children().isEmpty()) 
                    System.out.println("File not empty");
                
                map = Screen51Jobs(doc);
                printSrc = printScreen(map);
                saveFile(printSrc, txtFilePath);
                System.out.println(printSrc);
                
            //If both the HTML and txt files exist locally, read the contents of the txt file; otherwise throw an IOException.
            }else if(txtPath.exists() && htmlPath.exists()) {
                System.out.println("File not empty");
                printSrc = printScreen(txtPath);
                System.out.println(printSrc);
            }else
                throw new IOException("NOT HTML FILE");
            
        } catch (IOException e) { //Run the crawler in the catch block and save the files locally.
            
            System.out.println("file not found");

            try {
                
                //Fetch the query results page from the site
                doc = this.getResultPage(url,keyword);

                htmlPath.createNewFile();
                //Save the html file
                saveFile(doc.toString(),htmlFilePath);

                map = Screen51Jobs(doc);
                String printStr = printScreen(map);

                if(!txtPath.exists())
                    txtPath.createNewFile();
                //Save the txt file
                saveFile(printStr, txtFilePath);
                
                System.out.println(printStr);
                
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    
    }
    
    private String printScreen(File path) throws IOException{
        
        StringBuilder printSrc = new StringBuilder();
        InputStream in = new FileInputStream(path);
        BufferedInputStream bis = new BufferedInputStream(in);
        
        int len = 0;
        byte[] bytes = new byte[1024 * 8];
        while((len = bis.read(bytes, 0, bytes.length)) != -1) {
            //Append only the bytes actually read in this pass, not the whole buffer
            printSrc.append(new String(bytes, 0, len));
        }
        bis.close();
        
        return printSrc.toString();
    }
    
    private String printScreen(Map<?,?> screen) throws IOException {
        
        StringBuilder sb = new StringBuilder();
        String p = "\r\n";
        sb.append(p + " KeyWord:" + screen.get("keyword") + p + p +" Total query data:" 
                    + screen.get("totalquerydata") + p + p + " Recruitment info:");
        
        List<?> list = (List<?>)screen.get("recruitmentlist");

        for (Object o : list) {
            Map<?,?> map = (Map<?,?>)o;

            for (Map.Entry<?,?> entry : map.entrySet()) {
                sb.append(p + entry.getKey() + " == " + entry.getValue());
            }
            sb.append(p);
        }
        
        return sb.toString();
    }
    
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private Map<?,?> Screen51Jobs(Document doc){
        
        Map screen = new HashMap<String,Object>(); 
        
        Elements resultList = doc.select("div[class=dw_table]div[id=resultList]");
        Elements findKeyword = resultList.select("div[class=sbox]");
        Elements totalQueryData = resultList.select("div[class=rt]div:matchesOwn(^共)");
        Elements recruitmentInfo = resultList.select("div[class=el]");
    
        screen.put("keyword", findKeyword.text());
        screen.put("totalquerydata", totalQueryData.text());
        
        List recruitmentList = new ArrayList<Map<String,String>>(); 
        Map m = null;
        for (Element e : recruitmentInfo) {
            m = new HashMap<String,Object>();
            m.put("position",e.select("p[class~=^t1]").text());
            m.put("href", e.select("a").attr("href"));
            m.put("corporatename", e.select("a").text());
            m.put("address", e.select("span[class=t3]").text());
            m.put("salary", e.select("span[class=t4]").text());
            m.put("releasedate", e.select("span[class=t5]").text());
            recruitmentList.add(m);
        }
        screen.put("recruitmentlist", recruitmentList);
        
        return screen;
    }
    
    private void saveFile(String src,String path) throws IOException {

        OutputStream out = new FileOutputStream(path);
        BufferedOutputStream bos = new BufferedOutputStream(out);
        
        byte[] bytes = src.getBytes("utf-8");
        
        bos.write(bytes, 0, bytes.length);
        //Close the buffered stream so the content is flushed to disk
        bos.close();
    }
}
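A minimal usage sketch (the URL, keyword, directory, and file name below are placeholders, not taken from the original post):

public class SpiderMain {
    public static void main(String[] args) {
        Cat5jobs spider = new Cat5jobs();
        //On the first run this fetches the search results from the site and writes
        //result.htm and result2.txt under the given directory; later runs reuse the local files.
        spider.getResult("https://search.51job.com/", "java", "D:/spider/", "result");
    }
}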

 

Reposted from: https://www.cnblogs.com/galoliy/p/simple_spider.html

