技术交流


JSOUP爬虫

Aug 21, 2019 2:02:32 PM
75
1

jsoup是一款Java的HTML解析器,主要用来解析HTML文档。参考文档:

https://jsoup.org/

https://www.open-open.com/jsoup/

 

虽然jsoup支持从某个地址直接爬取网页源码,但是只支持HTTP、HTTPS协议,支持不够丰富。所以,主要还是用它来解析HTML,从HTML中抽取自己想要的数据。

 

利用jsoup爬取汇博网java职位列表数据

1、maven依赖

<dependencies>
    <!-- jsoup: the HTML parsing library whose org.jsoup.* classes are imported by the crawler class in this article -->
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.12.1</version>
    </dependency>
</dependencies>

 

2、编码获取java职位列表

package work.chenchuan.crawler.jsoup;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class DataFromHuibo {
    public static void main(String[] args) {
        //职位列表
        List<Map<String, Object>> list = new ArrayList<>();
        //获取所有职位列表
        list = getAllData(1, list);
        for (Map<String, Object> cur : list) {
            System.out.print(cur.get("companyName"));
            System.out.print("     ");
            System.out.print(cur.get("name"));
            System.out.print("     ");
            System.out.print(cur.get("money"));
            System.out.print("     ");
            System.out.print(cur.get("address"));
            System.out.print("     ");
            System.out.print(cur.get("exp"));
            System.out.print("     ");
            System.out.print(cur.get("detaiUrl"));
            System.out.print("\n");
        }
    }

    /**
     * 获取java职位每页所有列表数据
     *
     * @param p    当前页码
     * @param list 职位列表
     * @return
     */
    public static List<Map<String, Object>> getAllData(int p, List<Map<String, Object>> list) {
        //创建并返回URL的连接
        Connection connection = Jsoup.connect("https://www.huibo.com/jobsearch/?params=p" + p + "&key=java");
        Document document = null;
        try {
            //获取html文档
            document = connection.get();
            //获取列表dom节点
            Elements elements = document.select("#job_list_table .postIntro");
            //遍历节点获取数据并添加职位列表到list
            for (Element element : elements) {
                String companyName = element.select("div.title>a").attr("title");
                Elements infoDoms = element.select(".postIntroList .postIntroL");
                for (Element infoDom : infoDoms) {
                    String name = infoDom.select(".des_title").text();
                    String money = infoDom.select(".money").text();
                    String address = infoDom.select(".address").text();
                    String exp = infoDom.select(".exp").text();
                    String detaiUrl = infoDom.select(".name>a").attr("abs:href");
                    Map<String, Object> curInfo = new HashMap<>();
                    curInfo.put("companyName", companyName);
                    curInfo.put("name", name);
                    curInfo.put("money", money);
                    curInfo.put("address", address);
                    curInfo.put("exp", exp);
                    curInfo.put("detaiUrl", detaiUrl);
                    list.add(curInfo);
                }
            }
            //当前页公司条数
            int curPageSize = elements.size();
            //每页职位条数和公司条数不同,当前页公司条数>0查找下一页职位
            if (curPageSize > 0) {
                p++;
                getAllData(p, list);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return list;
    }


    /**
     * 获取java职位列表首页数据
     */
    public static void getData() {
        //创建并返回URL的连接
        Connection connection = Jsoup.connect("https://www.huibo.com/jobsearch/?params=p1&key=java×tamp=1566277536#list");
        //获取html文档
        Document document = null;
        try {
            document = connection.get();
            //获取列表dom节点
            Elements elements = document.select(".postIntroList .clearfix");
            //遍历节点获取数据
            for (Element element : elements) {
                //获取节点相关属性
                String name = element.select(".des_title").text();
                String money = element.select(".money").text();
                String address = element.select(".address").text();
                String exp = element.select(".exp").text();
                System.out.print(name);
                System.out.print("     ");
                System.out.print(money);
                System.out.print("     ");
                System.out.print(address);
                System.out.print("     ");
                System.out.print(exp);
                System.out.print("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

 

以上程序打印结果如下:

6e8ef65c-8f2e-4c51-9de7-5fddce186487 



如果你喜欢我的内容,就请打赏一下吧
微信
支付宝
温馨提示: 你的打赏金额会直接转入对方账户,不可退回。

评论专区


审核通过的评论(0)
暂无评论信息
个人名片

  欢迎来到“浩瀚星尘”的个人博客!
  首先,该博客用于分享本人的生活事迹与兴趣爱好; 此外,该博客的主要作用便是与广大的小伙伴一起分享探讨开发技术, 希望大家多多关照。

网名: 浩瀚星尘
城市: 重庆
工作: java