suger7 发表于 2022-10-20 16:39:01

java爬虫之HtmlUnit介绍

前端有时候会遇到项目临时需要网上收集数据的情况,什么方案是简单易懂、长期可用的呢,当然是用浏览器终端测试单元做爬虫是最方便的啦,将平时工作中的测试程序进行简单的修改,然后配合爬虫代理,就可以马上开始数据采集,是不是很方便呀。刚好之前也分享了一篇关于java爬虫的文章,那今天也是爬虫方面的知识,我们可以继续分享下java爬虫。不知道学java的对HtmlUnit熟悉不呢?它是java下的一款无头浏览器方案,通过相应的API模拟HTML协议,可以请求页面,提交表单,打开链接等等操作,完全模拟用户终端。支持复杂的JavaScript、AJAX库,可以模拟多种浏览器,包括Chrome,Firefox或IE等。下面提供一个简单的demo,通过调用爬虫代理访问IP查询网站,如果将目标网站修改为需要采集的数据链接,即可获取相应的数据,再加上数据分析模块就可以基本使用,示例是根据实际项目需求写的,看下要复杂些:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpHost;
import org.apache.http.NameValuePair;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.ProxyAuthenticationStrategy;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class Demo
{
    // 代理服务器(产品官网 www.16yun.cn)
    final static String proxyHost = "t.16yun.cn";
    final static Integer proxyPort = 31000;

    // 代理验证信息
    final static String proxyUser = "username";
    final static String proxyPass = "password";




    private static PoolingHttpClientConnectionManager cm = null;
    private static HttpRequestRetryHandler httpRequestRetryHandler = null;
    private static HttpHost proxy = null;

    private static CredentialsProvider credsProvider = null;
    private static RequestConfig reqConfig = null;

    static {
      ConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
      LayeredConnectionSocketFactory sslsf = SSLConnectionSocketFactory.getSocketFactory();

      Registry registry = RegistryBuilder.create()
            .register("http", plainsf)
            .register("https", sslsf)
            .build();

      cm = new PoolingHttpClientConnectionManager(registry);
      cm.setMaxTotal(20);
      cm.setDefaultMaxPerRoute(5);

      proxy = new HttpHost(proxyHost, proxyPort, "http");

      credsProvider = new BasicCredentialsProvider();
      credsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(proxyUser, proxyPass));

      reqConfig = RequestConfig.custom()
            .setConnectionRequestTimeout(5000)
            .setConnectTimeout(5000)
            .setSocketTimeout(5000)
            .setExpectContinueEnabled(false)
            .setProxy(new HttpHost(proxyHost, proxyPort))
            .build();
    }

    public static void doRequest(HttpRequestBase httpReq) {
      CloseableHttpResponse httpResp = null;

      try {
            setHeaders(httpReq);

            httpReq.setConfig(reqConfig);

            CloseableHttpClient httpClient = HttpClients.custom()
                .setConnectionManager(cm)
                .setDefaultCredentialsProvider(credsProvider)
                .build();

            //设置TCP keep alive,访问https网站时保持IP不切换
            // SocketConfig socketConfig = SocketConfig.custom().setSoKeepAlive(true).setSoTimeout(3600000).build();
            // CloseableHttpClient httpClient =HttpClients.custom()
            //    .setConnectionManager(cm)
            //    .setDefaultCredentialsProvider(credsProvider)
            //    .setDefaultSocketConfig(socketConfig)
            //    .build();


            AuthCache authCache = new BasicAuthCache();
            authCache.put(proxy, new BasicScheme());
            // 如果遇到407,可以设置代理认证 Proxy-Authenticate
            // authCache.put(proxy, new BasicScheme(ChallengeState.PROXY));

            HttpClientContext localContext = HttpClientContext.create();
            localContext.setAuthCache(authCache);

            httpResp = httpClient.execute(httpReq, localContext);

            int statusCode = httpResp.getStatusLine().getStatusCode();

            System.out.println(statusCode);

            BufferedReader rd = new BufferedReader(new InputStreamReader(httpResp.getEntity().getContent()));

            String line = "";
            while((line = rd.readLine()) != null) {
                System.out.println(line);
            }
      } catch (Exception e) {
            e.printStackTrace();
      } finally {
            try {
                if (httpResp != null) {
                  httpResp.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
      }
    }

    /**
   * 设置请求头
   *
   * @param httpReq
   */
    private static void setHeaders(HttpRequestBase httpReq) {

      // 设置Proxy-Tunnel
      // Random random = new Random();
      // int tunnel = random.nextInt(10000);
      // httpReq.setHeader("Proxy-Tunnel", String.valueOf(tunnel));

      httpReq.setHeader("Accept-Encoding", null);

    }


    public static void doGetRequest() {
      // 要访问的目标页面
      String targetUrl = "https://httpbin.org/ip";


      try {
            HttpGet httpGet = new HttpGet(targetUrl);

            doRequest(httpGet);
      } catch (Exception e) {
            e.printStackTrace();
      }
    }

    public static void main(String[] args) {
      doGetRequest();


    }
}

示例参考来源于亿牛云,因之前的业务需求购买了代理,一直都还在使用。刚好分享这篇文章就一起分享给大家了,在代理方面有需求的可以试试他们家提供的隧道代理,是我使用众多代理商里面IP质量好,售后服务也最好的一家。代理的详细介


zchzzz 发表于 2022-11-25 14:15:57

12312312
页: [1]
查看完整版本: java爬虫之HtmlUnit介绍