jsoup是一款Java的HTML解析器,主要用来对HTML进行解析。参考文档:
https://www.open-open.com/jsoup/
虽然jsoup支持从某个地址直接去爬取网页源码,但是只支持HTTP,HTTPS协议,支持不够丰富。所以,主要还是用来对HTML进行解析,从HTML中抽取自己想要的数据。
利用jsoup爬取汇博网java职位列表数据
1、maven依赖
<dependencies>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
</dependencies>
2、编码获取java职位列表
package work.chenchuan.crawler.jsoup;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DataFromHuibo {

    public static void main(String[] args) {
        // Accumulated job postings, one map per posting.
        List<Map<String, Object>> list = new ArrayList<>();
        // Crawl every result page starting from page 1.
        list = getAllData(1, list);
        for (Map<String, Object> cur : list) {
            System.out.print(cur.get("companyName"));
            System.out.print(" ");
            System.out.print(cur.get("name"));
            System.out.print(" ");
            System.out.print(cur.get("money"));
            System.out.print(" ");
            System.out.print(cur.get("address"));
            System.out.print(" ");
            System.out.print(cur.get("exp"));
            System.out.print(" ");
            // NOTE: key spelling "detaiUrl" (missing 'l') is kept as-is for
            // backward compatibility with existing consumers of this map.
            System.out.print(cur.get("detaiUrl"));
            System.out.print("\n");
        }
    }

    /**
     * Fetches every page of the "java" job search, starting at page {@code p},
     * and appends one map per posting to {@code list}.
     *
     * <p>Pagination stops when a page contains zero company entries, or when a
     * network error occurs (the partial result collected so far is returned).
     * Implemented as a loop rather than recursion so a large number of result
     * pages cannot overflow the call stack.
     *
     * @param p    first page number to fetch (1-based)
     * @param list destination list; postings are appended in crawl order
     * @return the same {@code list} instance, for caller convenience
     */
    public static List<Map<String, Object>> getAllData(int p, List<Map<String, Object>> list) {
        int page = p;
        while (true) {
            // Build the connection for the current result page.
            Connection connection = Jsoup.connect("https://www.huibo.com/jobsearch/?params=p" + page + "&key=java");
            try {
                // Fetch and parse the HTML document.
                Document document = connection.get();
                // One .postIntro element per company on the page.
                Elements elements = document.select("#job_list_table .postIntro");
                for (Element element : elements) {
                    String companyName = element.select("div.title>a").attr("title");
                    // Each company may list several positions.
                    Elements infoDoms = element.select(".postIntroList .postIntroL");
                    for (Element infoDom : infoDoms) {
                        String name = infoDom.select(".des_title").text();
                        String money = infoDom.select(".money").text();
                        String address = infoDom.select(".address").text();
                        String exp = infoDom.select(".exp").text();
                        // abs:href resolves the link against the page's base URL.
                        String detaiUrl = infoDom.select(".name>a").attr("abs:href");
                        Map<String, Object> curInfo = new HashMap<>();
                        curInfo.put("companyName", companyName);
                        curInfo.put("name", name);
                        curInfo.put("money", money);
                        curInfo.put("address", address);
                        curInfo.put("exp", exp);
                        // Key spelling "detaiUrl" kept for backward compatibility.
                        curInfo.put("detaiUrl", detaiUrl);
                        list.add(curInfo);
                    }
                }
                // An empty page means we have walked past the last result page.
                if (elements.isEmpty()) {
                    break;
                }
                page++;
            } catch (IOException e) {
                // Stop crawling on network failure; return what we have so far.
                e.printStackTrace();
                break;
            }
        }
        return list;
    }

    /**
     * Fetches only the first page of the "java" job search and prints each
     * posting's title, salary, address and experience requirement to stdout.
     */
    public static void getData() {
        // FIX: the original URL contained "java×tamp=..." — the result of the
        // HTML entity "&times" being rendered as "×"; the intended query
        // parameter is "&timestamp=...".
        Connection connection = Jsoup.connect("https://www.huibo.com/jobsearch/?params=p1&key=java&timestamp=1566277536#list");
        Document document = null;
        try {
            // Fetch and parse the HTML document.
            document = connection.get();
            // One element per position row on the first page.
            Elements elements = document.select(".postIntroList .clearfix");
            for (Element element : elements) {
                String name = element.select(".des_title").text();
                String money = element.select(".money").text();
                String address = element.select(".address").text();
                String exp = element.select(".exp").text();
                System.out.print(name);
                System.out.print(" ");
                System.out.print(money);
                System.out.print(" ");
                System.out.print(address);
                System.out.print(" ");
                System.out.print(exp);
                System.out.print("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
以上程序打印结果如下: