Java爬虫实现,HttpClient整合JSoup

从春节到现在,新型冠状病毒像一股含有毒气的烟雾,一直笼罩在人们的头顶上空,并且近期没有散去的迹象,真令人担忧呀。每天在微信、支付宝刷着疫情最新消息,希望确诊、疑似、死亡病例较昨日能够下降一些,也无意中发现他们是从各地方的卫健委获取数据的,他们是怎么获取疫情的这些统计数据的呢?作为程序员不由得想试一试。

首先,在pom文件中引入HttpClient、JSoup的jar包依赖;

<!-- data-scraping dependencies: start -->
    <!-- HttpClient: fetches pages; version is managed by the parent POM -->
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
    </dependency>
    <!-- JSoup: parses the fetched HTML -->
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>   
    <!-- data-scraping dependencies: end -->

第二步,把HttpClient的get方法,封装成一个工具类;

import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * HTTP fetching utility built on Apache HttpClient.
 *
 * <p>Both overloads perform a GET (via {@link HttpGetConfig}, which supplies
 * timeouts and a browser User-Agent) and return the response body as a
 * String, or an empty String on any failure.
 *
 * @author 程就人生
 * @Date
 */
public class GetResult {

    private static final Logger log = LoggerFactory.getLogger(GetResult.class);

    /**
     * Fetches the given URL. Pages whose Content-Type declares neither
     * utf-8 nor gb2312 are force-decoded as utf-8; otherwise the charset
     * reported by the response entity is used.
     *
     * @param url the address to fetch
     * @return the response body, or "" on any failure
     */
    public static String getResult(String url){
        // try-with-resources: both the client and the response are released
        // automatically when the block exits, so no finally clause is needed
        try (
            CloseableHttpClient httpClient = HttpClientBuilder.create().build();
            CloseableHttpResponse response = httpClient.execute(new HttpGetConfig(url))
        ){
            HttpEntity httpEntity = response.getEntity();
            // Content-Type may be absent from the response — guard against NPE
            String contentType = httpEntity.getContentType() == null
                    ? "" : String.valueOf(httpEntity.getContentType().getValue());
            log.info(contentType);
            // force utf-8 unless the page declares utf-8/UTF-8 or gb2312 itself
            if(!contentType.contains("utf-8")
                    && !contentType.contains("UTF-8")
                    && !contentType.contains("gb2312")){
                return EntityUtils.toString(httpEntity, "utf-8");
            }
            return EntityUtils.toString(httpEntity);
        } catch (Exception e) {
            // best-effort contract: log the cause (with stack trace) and
            // return an empty string instead of propagating
            log.error("获取失败", e);
            return "";
        }
    }

    /**
     * Fetches the given URL and decodes the body with a caller-supplied
     * charset — for sites whose declared encoding is wrong or missing.
     *
     * @param url    the address to fetch
     * @param encode charset name to decode the body with (e.g. "gb2312")
     * @return the response body, or "" on any failure
     */
    public static String getResult(String url, String encode){
        // try-with-resources releases the client and response automatically
        try (
            CloseableHttpClient httpClient = HttpClientBuilder.create().build();
            CloseableHttpResponse response = httpClient.execute(new HttpGetConfig(url))
        ){
            HttpEntity httpEntity = response.getEntity();
            // Content-Type may be absent — only log it when present
            if (httpEntity.getContentType() != null) {
                log.info(httpEntity.getContentType().toString());
            }
            return EntityUtils.toString(httpEntity, encode);
        } catch (Exception e) {
            // best-effort contract: log the cause and return an empty string
            log.error("获取失败", e);
            return "";
        }
    }
}

// Package-private helper: an HttpGet preconfigured with request timeouts
// and a browser-like User-Agent (avoids 403 Forbidden from some sites).
class HttpGetConfig extends HttpGet {

    /** Shared timeout for connection-request / connect / socket, in ms. */
    private static final int TIMEOUT_MS = 10000;

    public HttpGetConfig(String url) {
        super(url);
        setDefaultConfig();
    }

    /** Applies the timeouts and headers common to every request. */
    private void setDefaultConfig() {
        this.setConfig(RequestConfig.custom()
                // max time waiting for a connection from the pool
                .setConnectionRequestTimeout(TIMEOUT_MS)
                // max time establishing the TCP connection
                .setConnectTimeout(TIMEOUT_MS)
                // max idle time between two data packets
                .setSocketTimeout(TIMEOUT_MS)
                .build());
        // Browser User-Agent to avoid 403 Forbidden. (The original code first
        // set a "spider" UA that this header immediately overwrote — that
        // redundant call is removed.)
        this.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362");
    }
}

JSoup也是可以用来抓取网页的,但是没有HttpClient专业;JSoup更善于解析页面上的内容,下面就看看如何使用JSoup来解析网页吧。

第三步,使用工具类,获取页面的内容;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;

import com.example.demo.util.GetResult;

/**
 * Scrapes the daily COVID-19 bulletin from the Anhui Health Commission
 * website and logs per-city statistics (new confirmed / suspected /
 * cured / death counts).
 *
 * @author FengJuan
 * @Date
 */
@Service
public class AnhuiCapture {

    // Keyword the bulletin title must contain to be recognized
    String virusName = "新型冠状病毒";

    private static final Logger log = LoggerFactory.getLogger(AnhuiCapture.class);

    /**
     * Fetches the news-list page, locates the latest bulletin for the
     * hard-coded date, parses its four statistics sentences and logs the
     * merged per-city results. Errors are logged, never thrown.
     */
    public void getContentFromUrl() {
        try {
            String url = "http://wjw.ah.gov.cn/news_list_477_1.html";
            // NOTE(review): hard-coded date — only this day's bulletin is parsed
            String date = "2月9日";
            String content = GetResult.getResult(url);
            // parse the news-list page
            Document doc = Jsoup.parse(content);
            Element newsList = doc.select(".list").get(0);
            // the first link in the list is the latest article
            String title = newsList.select("a").get(0).text();
            if (title.contains(date) && title.contains(virusName)) {
                // follow the relative link to the bulletin itself
                String dataUrl = "http://wjw.ah.gov.cn/" + newsList.select("a").get(0).attr("href");
                content = GetResult.getResult(dataUrl);
                doc = Jsoup.parse(content);
                newsList = doc.select("#art_content").get(0);
                // spans 0..3 hold: new confirmed, suspected, cured, deaths
                Elements elements = newsList.select("span");
                List<Map<String, Object>> infoList = new ArrayList<>();
                mergeStatistics(infoList, elements.get(0).text(), "newConfirm");
                mergeStatistics(infoList, elements.get(1).text(), "newSuspected");
                mergeStatistics(infoList, elements.get(2).text(), "newCure");
                mergeStatistics(infoList, elements.get(3).text(), "newDeath");
                for (Map<String, Object> s : infoList) {
                    log.info(s.get("firstName").toString() + s.get("secName").toString()
                            + ",新增确诊:" + (s.containsKey("newConfirm") ? s.get("newConfirm").toString() : "0")
                            + ",新增疑似:" + (s.containsKey("newSuspected") ? s.get("newSuspected").toString() : "0")
                            + ",新增出院:" + (s.containsKey("newCure") ? s.get("newCure").toString() : "0")
                            + ",新增死亡:" + (s.containsKey("newDeath") ? s.get("newDeath").toString() : "0"));
                }
            }
        } catch (Exception e) {
            // keep the stack trace attached to the log entry instead of
            // printStackTrace() + a message-only log call
            log.error("安徽省统计数据获取异常!", e);
        }
    }

    /**
     * Parses one bulletin sentence of the form
     * "…,合肥市35例、亳州市3例…" and merges each city's count into
     * {@code infoList} under {@code key}: an existing entry for the city is
     * updated, otherwise a new map with firstName/secName is appended.
     *
     * @param infoList accumulated per-city statistics (mutated in place)
     * @param content  the raw sentence text of one span element
     * @param key      statistic name, e.g. "newConfirm" or "newDeath"
     */
    private void mergeStatistics(List<Map<String, Object>> infoList, String content, String key) {
        // the per-city list follows the first full-width comma,
        // items separated by "、"
        String[] items = content.substring(content.indexOf(",") + 1).split("、");
        for (String item : items) {
            // count = the digits; city name = everything before the count
            String num = item.replaceAll("[^0-9]", "");
            String secName = item.substring(0, item.indexOf(num));
            boolean exists = false;
            for (Map<String, Object> s : infoList) {
                // city already present: just add this statistic to its map
                if (s.get("secName").equals(secName)) {
                    s.put(key, num);
                    exists = true;
                    break;
                }
            }
            if (!exists) {
                Map<String, Object> info = new HashMap<String, Object>();
                info.put("firstName", "安徽省");
                info.put("secName", secName);
                info.put(key, num);
                infoList.add(info);
            }
        }
    }
}

最后,启动项目,查看测试结果;

测试结果

可能遇到的问题:
1.网页上明明设置的utf-8编码,抓取后却没有编码,导致抓到的内容是乱码;这时可以通过在工具类GetResult 中加入utf-8的编码;
2.网页中设置的编码是gb2312,抓取后再设置为utf-8,获取到的还是乱码;这时可以通过在工具类GetResult 中设置为gb2312的编码,就不会乱码了;
3.在获取某些网页时,会遇到403问题,被拒绝访问,这时也可以通过在工具类中的header中增加User-Agent属性来解决;
4.在解析页面中遇到的问题,比如在页面中使用JavaScript书写的标签,这就需要使用JSoup类来解决了。
5.总共有30多个省市,每个省市网页上的排版规则也不相同;现在只抓取了这一个省,就写了如此多的代码,大约两百行;如果全国每个省的数据都抓取过来,一旦页面排版变了,标签的规则也变了,就会一动百动,后面有得维护了,这真是一件麻烦事,不知道有什么更好的办法?

参考文档:
https://blog.csdn.net/River_sum/article/details/82533648
https://blog.csdn.net/ITNoobie/article/details/48262785
https://www.open-open.com/jsoup/

推荐阅读更多精彩内容