学习笔记 : Java爬虫之HttpClient
简介 : HttpClient
是Apache Jakarta Common下的子项目,用于提供高效的,功能丰富的支持HTTP协议的客户端编程工具包,其主要功能如下:
- 实现了所有HTTP的方法 : GET,POST,PUT,HEAD ..
- 支持自动重定向
- 支持HTTPS协议
- 支持代理服务器
实现爬虫
传统实现 : 爬取静态网页
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
| package pers.huangyuhui.crawler.HttpClient_Demo;
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.nio.charset.StandardCharsets;
/**
 * Minimal crawler using only the JDK: opens a URL stream and prints the page source.
 */
public class BasicCrawler {

    /**
     * Entry point: downloads https://yubuntu0109.github.io/ and prints its HTML.
     *
     * @param args unused
     * @throws IOException if the connection or the read fails
     */
    public static void main(String[] args) throws IOException {
        URL pageURL = new URL("https://yubuntu0109.github.io/");
        StringBuilder pageBuffer = new StringBuilder();
        // try-with-resources: the reader is closed even when readLine() throws,
        // fixing the leak of the original happy-path-only reader.close()
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(pageURL.openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the line terminator; re-append it so the captured
                // page keeps its line structure instead of gluing all lines together
                pageBuffer.append(line).append('\n');
            }
        }
        System.out.println(pageBuffer.toString());
    }
}
|
HttpClient实现 : 爬取静态网页
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
| package pers.huangyuhui.crawler.HttpClient_Demo;
import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Crawls a static page with Apache HttpClient and prints the raw HTML on HTTP 200.
 */
public class HttpClientCrawler {

    public static void main(String[] args) {
        // prepare the GET request and spoof a User-Agent header
        HttpGet request = new HttpGet("https://www.bilibili.com/");
        request.setHeader("User-Agent", "xxxxxx");
        // both client and response are AutoCloseable; one try manages them in order
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(request)) {
            int status = response.getStatusLine().getStatusCode();
            if (status == 200) {
                HttpEntity entity = response.getEntity();
                // decode the body as UTF-8 and dump it to stdout
                System.out.println(EntityUtils.toString(entity, "utf-8"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
|
GET请求
无参GET请求 : 爬取B站静态网页
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
| package pers.huangyuhui.crawler.HttpClient_Get;
import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Parameterless GET example: fetches the Bilibili home page and prints its HTML.
 */
public class GetTest {

    public static void main(String[] args) {
        HttpGet get = new HttpGet("https://www.bilibili.com/");
        get.setHeader("User-Agent", "x-x-x-x-x-x");
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse resp = client.execute(get)) {
            // guard clause: on a non-200 status there is nothing to print;
            // the try-with-resources still releases client and response
            if (resp.getStatusLine().getStatusCode() != 200) {
                return;
            }
            HttpEntity body = resp.getEntity();
            String html = EntityUtils.toString(body, "utf-8");
            System.out.println(html);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
|
有参GET请求 : 爬取B站中搜索关键字为jsoup的网页内容
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
| package pers.huangyuhui.crawler.HttpClient_Get;
import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.utils.URIBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils;
import java.io.IOException; import java.net.URISyntaxException;
/**
 * GET with query parameters: searches Bilibili for the keyword "jsoup".
 */
public class GetParamTest {

    /**
     * Builds https://search.bilibili.com/all?keyword=jsoup and prints the response HTML.
     *
     * @param args unused
     * @throws URISyntaxException if the base URI is malformed
     * @throws IOException        if the request or the body read fails
     */
    public static void main(String[] args) throws URISyntaxException, IOException {
        // URIBuilder handles the query-string encoding for us
        URIBuilder uriBuilder = new URIBuilder("https://search.bilibili.com/all");
        uriBuilder.setParameter("keyword", "jsoup");
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        httpGet.setHeader("User-Agent", "x-x-x-x-x-x");
        // try-with-resources replaces the manual close() calls: the original leaked
        // both client and response whenever execute()/toString() threw
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf-8");
                System.out.println(content);
            }
        }
    }
}
|
POST请求
无参POST请求 : 爬取CSDN静态网页
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
| package pers.huangyuhui.crawler.HttpClient_Post;
import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Parameterless POST example: posts to the CSDN home page and prints the response HTML.
 */
public class PostTest {

    public static void main(String[] args) {
        // a POST request with no body, only a spoofed User-Agent header
        HttpPost post = new HttpPost("https://www.csdn.net/");
        post.setHeader("User-Agent", "x-x-x-x-x-x");
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(post)) {
            boolean ok = response.getStatusLine().getStatusCode() == 200;
            if (ok) {
                String page = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(page);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
|
有参POST请求 : 爬取B站中搜索关键字为jsoup的网页内容(注:其并不支持POST,即只做演示)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
| package pers.huangyuhui.crawler.HttpClient_Post;
import org.apache.http.HttpEntity; import org.apache.http.NameValuePair; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.message.BasicNameValuePair; import org.apache.http.util.EntityUtils;
import java.io.IOException; import java.util.ArrayList; import java.util.List;
/**
 * POST with form parameters: sends keyword=jsoup to the Bilibili search endpoint.
 * (The endpoint only supports GET, so this class is for demonstration only.)
 */
public class PostParamTest {

    /**
     * Posts an URL-encoded form body (keyword=jsoup) and prints the response HTML.
     *
     * @param args unused
     * @throws IOException if the request or the body read fails
     */
    public static void main(String[] args) throws IOException {
        HttpPost httpPost = new HttpPost("https://search.bilibili.com/all");
        // form body, URL-encoded as keyword=jsoup
        List<NameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("keyword", "jsoup"));
        httpPost.setEntity(new UrlEncodedFormEntity(params, "utf-8"));
        httpPost.setHeader("User-Agent", "x-x-x-x-x-x");
        // try-with-resources replaces the manual close() calls: the original leaked
        // both client and response whenever execute()/toString() threw
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf-8");
                System.out.println(content);
            }
        }
    }
}
|
HttpClient连接池
学习使用HttpClient连接池
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
| package pers.huangyuhui.crawler.HttpClient_ConnPool;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
/**
 * Demonstrates sharing a PoolingHttpClientConnectionManager across requests.
 */
public class PoolTest {

    public static void main(String[] args) {
        PoolingHttpClientConnectionManager phccm = new PoolingHttpClientConnectionManager();
        phccm.setMaxTotal(100);          // at most 100 pooled connections overall
        phccm.setDefaultMaxPerRoute(10); // at most 10 connections per target host
        doGet(phccm);
    }

    /**
     * Executes one GET through a client backed by the shared connection pool.
     *
     * @param phccm the shared connection manager; ownership stays with the caller
     */
    private static void doGet(PoolingHttpClientConnectionManager phccm) {
        HttpGet httpGet = new HttpGet("https://www.bilibili.com/");
        httpGet.setHeader("User-Agent", "x-x-x-x-x-x");
        // NOTE: the client is deliberately not closed here — closing it would also
        // shut down the shared connection manager supplied by the caller
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(phccm).build();
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf-8");
                System.out.println(content);
            }
        } catch (IOException e) {
            // narrowed from the original catch (Exception): only I/O failures are
            // expected here; programming errors should propagate instead of being logged
            e.printStackTrace();
        }
    }
}
|