HtmlUnit是一个没有GUI的模拟浏览器测试框架,之前在Python使用过Selenium进行自动化测试和一些爬虫的操作。但是Selenium在使用的时候会出现一个浏览器界面进行真实操作。

由于HtmlUnit没有界面,编写代码时无法直观看到浏览器当前处于什么状态,有时会出现很难继续写下去的情况。

  1. POM中引入HtmlUnit

    <!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
    <dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.70.0</version>
    </dependency>
  2. 编写代码

    package org.example;

    import com.gargoylesoftware.htmlunit.BrowserVersion;
    import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
    import com.gargoylesoftware.htmlunit.WebClient;
    import com.gargoylesoftware.htmlunit.html.HtmlElement;
    import com.gargoylesoftware.htmlunit.html.HtmlPage;
    import org.apache.http.HttpHost;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.protocol.HttpClientContext;
    import org.apache.http.client.utils.URIUtils;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;

    import java.io.IOException;
    import java.net.URI;
    import java.util.List;

    /**
     * Demonstrates driving a headless HtmlUnit browser: first expands a
     * redirecting short link into its final long URL via Apache HttpClient,
     * then loads that page and clicks the "download PDF" button located by XPath.
     */
    public class JsoupHttpClient {

        public static void main(String[] args) throws Exception {
            // Expand the short link into the final (redirected) long URL first.
            String url = getAbsUrl("https://nnfp.jss.com.cn/scan-invoice/printQrcode?paramList=xxxxx&aliView=true");
            // WebClient is AutoCloseable; the old code leaked it.
            try (WebClient webClient = getWebClient()) {
                HtmlPage page = webClient.getPage(url);
                // Sometimes the page's background JavaScript needs time to run:
                // webClient.waitForBackgroundJavaScriptStartingBefore(30000);

                // Locate the "download PDF" button on the page.
                List<HtmlElement> spanList = page.getByXPath("//div[@class='logo-btn-area']/a[@class='primary']");
                if (spanList.isEmpty()) {
                    // Guard against IndexOutOfBoundsException when the XPath matches nothing.
                    System.err.println("Download button not found on page: " + url);
                    return;
                }
                HtmlElement htmlElement = spanList.get(0);
                String string = htmlElement.click().getUrl().toString();
                System.out.println(string);
            }
        }

        /**
         * Builds a WebClient configured for AJAX-heavy pages.
         *
         * @return a new Edge-emulating WebClient with a resynchronizing AJAX controller
         */
        private static WebClient getWebClient() {
            WebClient webClient = new WebClient(BrowserVersion.EDGE);
            // webClient.getOptions().setCssEnabled(true);// enable CSS — usually irrelevant for scraping, but this site misbehaves with CSS off
            // webClient.getOptions().setThrowExceptionOnScriptError(false);// suppress JS errors
            // webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);// suppress failing-status exceptions
            // webClient.getOptions().setJavaScriptEnabled(true);// enable JavaScript
            // webClient.getOptions().setUseInsecureSSL(true);// accept any host, valid certificate or not
            // webClient.getOptions().setTimeout(50000);// request timeout
            // Make AJAX calls synchronous so the page is fully populated before we query it.
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
            return webClient;
        }

        /**
         * Follows HTTP redirects for the given URL and returns the final absolute address.
         *
         * @param url the source (possibly short/redirecting) URL
         * @return the absolute URL of the final target page, or {@code null} on failure
         */
        public static String getAbsUrl(String url) {
            HttpClientContext context = HttpClientContext.create();
            HttpGet httpget = new HttpGet(url);
            // try-with-resources closes the response before the client and avoids the
            // NullPointerException the old finally block threw when execute() failed
            // (response was still null but response.close() was called unconditionally).
            try (CloseableHttpClient httpclient = HttpClients.createDefault();
                 CloseableHttpResponse response = httpclient.execute(httpget, context)) {
                HttpHost target = context.getTargetHost();
                List<URI> redirectLocations = context.getRedirectLocations();
                URI location = URIUtils.resolve(httpget.getURI(), target, redirectLocations);
                return location.toASCIIString();
            } catch (Exception e) {
                // Best-effort: log and fall through to null, matching the original contract.
                e.printStackTrace();
                return null;
            }
        }
    }

代码中先使用了getAbsUrl方法,把对应的短链接变成长链接,然后获取浏览器连接,通过XPATH定位的方法//div[@class='logo-btn-area']/a[@class='primary']定位到页面下载PDF的按钮,然后点击下载

image-20240118135434494

image-20240118135552362