Fetching Page Information with an HtmlUnit Crawler
HtmlUnit is a headless browser testing framework with no GUI. I had previously used Selenium with Python for automated testing and some crawling, but Selenium pops up a real browser window and performs the actions visibly.
With HtmlUnit, on the other hand, you cannot see what state the browser is in while you write the code, which can sometimes make it hard to keep going.
- Add the HtmlUnit dependency to the POM:

```xml
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.70.0</version>
</dependency>
```
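A side note, since this post targets the 2.x artifact above: the HtmlUnit 3.x line moved to the `org.htmlunit` group ID and the `org.htmlunit.*` package namespace, so the `com.gargoylesoftware` imports in the code below only apply to 2.x. If you switch to 3.x, the coordinates would look roughly like this (version left as a placeholder):

```xml
<!-- HtmlUnit 3.x: group ID and package namespace change to org.htmlunit -->
<dependency>
    <groupId>org.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version><!-- pick a 3.x release --></version>
</dependency>
```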
- Write the code:

```java
package org.example;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.IOException;
import java.net.URI;
import java.util.List;

public class JsoupHttpClient {

    public static void main(String[] args) throws Exception {
        WebClient webClient = getWebClient();
        // Expand the short link into the full (redirected) URL
        String url = getAbsUrl("https://nnfp.jss.com.cn/scan-invoice/printQrcode?paramList=xxxxx&aliView=true");
        HtmlPage page = webClient.getPage(url);
        // Sometimes you need to wait a moment for the page's JavaScript to finish
        // webClient.waitForBackgroundJavaScriptStartingBefore(30000);
        // Locate the "download PDF" button on the page
        List<HtmlElement> spanList = page.getByXPath("//div[@class='logo-btn-area']/a[@class='primary']");
        HtmlElement htmlElement = spanList.get(0);
        String string = htmlElement.click().getUrl().toString();
        System.out.println(string);
        webClient.close();
    }

    private static WebClient getWebClient() {
        WebClient webClient = new WebClient(BrowserVersion.EDGE);
        // webClient.getOptions().setCssEnabled(true); // enable CSS; normally CSS does not affect scraping, but the NuoNuo invoice page breaks with CSS disabled
        // webClient.getOptions().setThrowExceptionOnScriptError(false); // suppress JavaScript errors
        // webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); // suppress exceptions on failing status codes
        // webClient.getOptions().setJavaScriptEnabled(true); // enable JavaScript
        // webClient.getOptions().setUseInsecureSSL(true); // accept any host, with or without a valid certificate
        // webClient.getOptions().setTimeout(50000); // request timeout
        webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // wait for AJAX calls to finish
        return webClient;
    }

    /**
     * Follows redirects and returns the final address.
     *
     * @param url the source URL
     * @return the absolute URL of the target page
     */
    public static String getAbsUrl(String url) {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        HttpClientContext context = HttpClientContext.create();
        HttpGet httpget = new HttpGet(url);
        CloseableHttpResponse response = null;
        String absUrl = null;
        try {
            response = httpclient.execute(httpget, context);
            HttpHost target = context.getTargetHost();
            List<URI> redirectLocations = context.getRedirectLocations();
            URI location = URIUtils.resolve(httpget.getURI(), target, redirectLocations);
            absUrl = location.toASCIIString();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
                httpclient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return absUrl;
    }
}
```
The code first calls the getAbsUrl method to expand the short link into its full long URL, then loads that page with the WebClient, locates the page's download-PDF button with the XPath expression //div[@class='logo-btn-area']/a[@class='primary'], and finally clicks it to trigger the download.
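If you want to actually save the PDF rather than just print the URL returned by the click, one option is to read the clicked page's response body directly. The helper below is a minimal sketch under the assumption that the click response is the PDF itself; the class name PdfSaver and the target path are illustrative and not part of the original code.

```java
import com.gargoylesoftware.htmlunit.Page;

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

public class PdfSaver {

    /**
     * Writes the body of a page (e.g. the result of htmlElement.click())
     * to the given file, assuming the response body is the PDF itself.
     */
    public static void savePage(Page page, Path target) throws Exception {
        try (InputStream in = page.getWebResponse().getContentAsStream()) {
            Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
        }
    }
}
```

Called from main this would look something like `PdfSaver.savePage(htmlElement.click(), Paths.get("invoice.pdf"))`; if the click only returns a redirect page, fetch the printed URL instead.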