JAVA爬虫

2018-06-18 01:25:55来源:未知 阅读 ()

新老客户大回馈,云服务器低至5折

基于 HttpClient 获取网页对象,并用 jsoup 解析页面;开发工具为 IntelliJ IDEA,项目使用 Spring Boot 框架。

下面的代码参考了网上找到的 Gecco 爬虫框架的源码;如有问题,可以到 GitHub 上查阅 Gecco 框架。

 

1.

Requestor:获取网页对象(已封装)

 

package com.example.demo.httpclient;

import org.apache.http.*;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.impl.cookie.DefaultCookieSpecProvider;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Small HTTP helper built on Apache HttpClient: sends GET and form-encoded POST requests with a
 * fixed set of browser-like headers, and offers helpers for cookies and for dumping a response.
 *
 * <p>{@link #doGet(String)} and {@link #doPost(String, Map)} print I/O failures to stderr and
 * return {@code null} instead of throwing, so callers must null-check the result.
 */
public class Requestor {

    protected final static String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36";
    protected final static String Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    protected final static String AcceptLanguage = "zh-CN,zh;q=0.8,en;q=0.6";

    /** Referer header sent with every request issued through this class. */
    private static final String REFERER = "https://www.oschina.net/home/login?goto_page=http%3A%2F%2Fwww.oschina.net%2F";

    /** Value used for the connection-request, connect and socket timeouts (milliseconds). */
    private static final int TIMEOUT_MILLIS = 300000;

    /** Cookie store created by {@link #setCookieStore(HttpResponse)}; {@code null} until then. */
    protected CookieStore cookieStore;
    /** Execution context created by {@link #setContext()}; {@code null} until then. */
    protected HttpClientContext context;
    /** Client used for every request. */
    protected CloseableHttpClient client;

    /** Creates a requestor backed by a default HttpClient instance. */
    public Requestor() {
        client = HttpClients.createDefault();
    }

    /**
     * Posts the given form parameters to {@code loginUrl} and prints the response.
     *
     * @param loginUrl URL of the login endpoint
     * @param params   form fields to submit
     */
    public void doLogin(String loginUrl, Map<String, String> params) {
        printResponse(doPost(loginUrl, params));
    }

    /**
     * Prints status line, headers and body of a response to stdout.
     *
     * @param httpResponse response to dump; {@code null} (a failed request) is reported and ignored
     */
    public void printResponse(HttpResponse httpResponse) {
        // doGet/doPost return null when the request failed; do not crash while reporting that.
        if (httpResponse == null) {
            System.out.println("status:no response (request failed)");
            return;
        }
        System.out.println("status:" + httpResponse.getStatusLine());
        System.out.println("headers:");
        HeaderIterator iterator = httpResponse.headerIterator();
        while (iterator.hasNext()) {
            System.out.println("\t" + iterator.next());
        }
        HttpEntity entity = httpResponse.getEntity();
        if (entity == null) {
            return;
        }
        try {
            String responseString = EntityUtils.toString(entity);
            System.out.println("response length:" + responseString.length());
            System.out.println("response content:" + responseString.replace("\r\n", ""));
        } catch (org.apache.http.ParseException | IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a fresh execution context that uses the default cookie spec and the current
     * {@link #cookieStore}. Call it again after {@link #setCookieStore(HttpResponse)} so the
     * context sees the new store.
     */
    public void setContext() {
        context = HttpClientContext.create();
        // The original registered CookieSpecs.DEFAULT twice; one registration is sufficient.
        Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider>create()
                .register(CookieSpecs.DEFAULT, new DefaultCookieSpecProvider())
                .build();
        context.setCookieSpecRegistry(registry);
        context.setCookieStore(cookieStore);
    }

    /**
     * Replaces {@link #cookieStore} with a new store holding an {@code oscid} cookie for
     * {@code .oschina.net}, taken from the last {@code Set-Cookie} header of the response.
     * When the response is {@code null} or has no such header the new store stays empty.
     *
     * @param httpResponse response whose {@code Set-Cookie} headers are inspected
     */
    public void setCookieStore(HttpResponse httpResponse) {
        cookieStore = new BasicCookieStore();
        if (httpResponse == null) {
            return;
        }
        String cookieValue = null;
        for (Header header : httpResponse.getHeaders("Set-Cookie")) {
            System.out.println(header.getName() + ":" + header.getValue());
            cookieValue = header.getValue();
        }
        if (cookieValue == null) {
            // No Set-Cookie header: do not store a cookie with a null value.
            return;
        }
        // NOTE(review): the raw header value (including any attributes after ';') is stored as
        // the cookie value - confirm this is what the target site expects.
        BasicClientCookie cookie = new BasicClientCookie("oscid", cookieValue);
        cookie.setDomain(".oschina.net");
        cookie.setPath("/");
        cookieStore.addCookie(cookie);
    }

    /**
     * Converts a parameter map into name/value pairs for a form entity.
     *
     * @param parameterMap form fields; {@code null} is treated as empty
     * @return one pair per map entry, in the map's iteration order
     */
    public List<NameValuePair> getParam(Map<String, String> parameterMap) {
        List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
        if (parameterMap == null) {
            return nameValuePairs;
        }
        for (Map.Entry<String, String> entry : parameterMap.entrySet()) {
            nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
        }
        return nameValuePairs;
    }

    /**
     * Sends a GET request.
     *
     * @param url target URL
     * @return the response, or {@code null} if the request failed with an I/O error
     */
    public HttpResponse doGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        config(httpGet);
        return execute(httpGet);
    }

    /**
     * Sends a POST request with the parameters encoded as a UTF-8 form body.
     *
     * @param url    target URL
     * @param params form fields; {@code null} is treated as empty
     * @return the response, or {@code null} if the request failed with an I/O error
     */
    public HttpResponse doPost(String url, Map<String, String> params) {
        HttpPost httpPost = new HttpPost(url);
        try {
            httpPost.setEntity(new UrlEncodedFormEntity(getParam(params), "UTF-8"));
        } catch (IOException e) {
            // UnsupportedEncodingException is an IOException; keep the print-and-return-null contract.
            e.printStackTrace();
            return null;
        }
        config(httpPost);
        return execute(httpPost);
    }

    /**
     * Applies the common headers and timeouts to a request.
     *
     * @param httpRequestBase request to configure
     */
    protected void config(HttpRequestBase httpRequestBase) {
        httpRequestBase.setHeader("User-Agent", USER_AGENT);
        httpRequestBase.setHeader("Accept", Accept);
        httpRequestBase.setHeader("Accept-Language", AcceptLanguage);
        httpRequestBase.setHeader("Referer", REFERER);
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .build();
        httpRequestBase.setConfig(requestConfig);
    }

    /**
     * Runs a request with the current context so that a context built by {@link #setContext()}
     * (and its cookie store) is actually used; a {@code null} context means client defaults.
     */
    private HttpResponse execute(HttpRequestBase request) {
        try {
            return client.execute(request, context);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

}

2.
AbstractClient类
package com.example.demo.httpclient;

import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.impl.cookie.DefaultCookieSpecProvider;
import org.apache.http.message.BasicNameValuePair;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Base HTTP client built on Apache HttpClient: sends GET and form-encoded POST requests with
 * browser-like headers and short timeouts, plus helpers for building a cookie store and context.
 *
 * <p>{@link #doGet(String)} and {@link #doPost(String, Map)} print I/O failures to stderr and
 * return {@code null} instead of throwing, so callers must null-check the result.
 */
public class AbstractClient {

    protected final static String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36";
    protected final static String Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    protected final static String AcceptLanguage = "zh-CN,zh;q=0.8,en;q=0.6";

    /** Value used for the connection-request, connect and socket timeouts (milliseconds). */
    private static final int TIMEOUT_MILLIS = 3000;

    /** Cookie store created by {@link #setCookieStore(HttpResponse)}; {@code null} until then. */
    protected CookieStore cookieStore;
    /** Execution context created by {@link #setContext()}; {@code null} until then. */
    protected HttpClientContext context;
    /** Client used for every request. */
    protected CloseableHttpClient client;

    /** Creates a client backed by a default HttpClient instance. */
    public AbstractClient() {
        client = HttpClients.createDefault();
    }

    /**
     * Creates a fresh execution context that uses the default cookie spec and the current
     * {@link #cookieStore}. Call it again after {@link #setCookieStore(HttpResponse)} so the
     * context sees the new store.
     */
    public void setContext() {
        context = HttpClientContext.create();
        // The original registered CookieSpecs.DEFAULT twice; one registration is sufficient.
        Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider>create()
                .register(CookieSpecs.DEFAULT, new DefaultCookieSpecProvider())
                .build();
        context.setCookieSpecRegistry(registry);
        context.setCookieStore(cookieStore);
    }

    /**
     * Replaces {@link #cookieStore} with a new store holding an {@code oscid} cookie for
     * {@code .oschina.net}, taken from the last {@code Set-Cookie} header of the response.
     * When the response is {@code null} or has no such header the new store stays empty.
     *
     * @param httpResponse response whose {@code Set-Cookie} headers are inspected
     */
    public void setCookieStore(HttpResponse httpResponse) {
        cookieStore = new BasicCookieStore();
        if (httpResponse == null) {
            return;
        }
        String lastSetCookie = null;
        for (Header header : httpResponse.getHeaders("Set-Cookie")) {
            System.out.println(header.getName() + ":" + header.getValue());
            lastSetCookie = header.getValue();
        }
        if (lastSetCookie == null) {
            // No Set-Cookie header: do not store a cookie with a null value.
            return;
        }
        // NOTE(review): the raw header value (including any attributes after ';') is stored as
        // the cookie value - confirm this is what the target site expects.
        BasicClientCookie cookie = new BasicClientCookie("oscid", lastSetCookie);
        cookie.setDomain(".oschina.net");
        cookie.setPath("/");
        cookieStore.addCookie(cookie);
    }

    /**
     * Converts a parameter map into name/value pairs for a form entity.
     *
     * @param parameterMap form fields; {@code null} is treated as empty
     * @return one pair per map entry, in the map's iteration order
     */
    public List<NameValuePair> getParam(Map<String, String> parameterMap) {
        List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
        if (parameterMap == null) {
            return nameValuePairs;
        }
        for (Map.Entry<String, String> entry : parameterMap.entrySet()) {
            nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
        }
        return nameValuePairs;
    }

    /**
     * Sends a GET request.
     *
     * @param url target URL
     * @return the response, or {@code null} if the request failed with an I/O error
     */
    public HttpResponse doGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        config(httpGet);
        return execute(httpGet);
    }

    /**
     * Sends a POST request with the parameters encoded as a UTF-8 form body.
     *
     * @param url    target URL
     * @param params form fields; {@code null} is treated as empty
     * @return the response, or {@code null} if the request failed with an I/O error
     */
    public HttpResponse doPost(String url, Map<String, String> params) {
        HttpPost httpPost = new HttpPost(url);
        try {
            httpPost.setEntity(new UrlEncodedFormEntity(getParam(params), "UTF-8"));
        } catch (IOException e) {
            // UnsupportedEncodingException is an IOException; keep the print-and-return-null contract.
            e.printStackTrace();
            return null;
        }
        config(httpPost);
        return execute(httpPost);
    }

    /**
     * Applies the common headers and timeouts to a request.
     *
     * @param httpRequestBase request to configure
     */
    protected void config(HttpRequestBase httpRequestBase) {
        httpRequestBase.setHeader("User-Agent", USER_AGENT);
        httpRequestBase.setHeader("Accept", Accept);
        httpRequestBase.setHeader("Accept-Language", AcceptLanguage);
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .build();
        httpRequestBase.setConfig(requestConfig);
    }

    /**
     * Runs a request with the current context so that a context built by {@link #setContext()}
     * (and its cookie store) is actually used; a {@code null} context means client defaults.
     */
    private HttpResponse execute(HttpRequestBase request) {
        try {
            return client.execute(request, context);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

}

3.使用--参照用法
package com.example.demo.getpage;

import com.example.demo.entity.CarBrand;
import com.example.demo.entity.CarDemio;
import com.example.demo.entity.CarVehicle;
import com.alibaba.fastjson.JSONArray;
import com.example.demo.httpclient.Requestor;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Scrapes car data from che300.com: brands from the home page, then the series of every brand
 * and the vehicle models of every series from the site's JSON endpoints.
 *
 * <p>Lookups are best effort: a failed request or an unusable response body is reported on
 * stderr and contributes an empty list instead of aborting the crawl.
 */
public class CarGet {

    private static final String HOME_URL = "https://www.che300.com/?from=bd_seo&city=11";
    private static final String BRAND_LIST_CLASS = "ucarselecttype_pinpaibottom_ul brand";
    private static final String SERIES_URL_PREFIX = "https://ssl-meta.che300.com/meta/series/series_brand";
    private static final String MODEL_URL_PREFIX = "https://ssl-meta.che300.com/meta/model/model_series";
    private static final String JSON_URL_SUFFIX = ".json?v=159";

    private Requestor requestor = new Requestor();

    /**
     * Reads the car brands listed on the home page.
     *
     * @return one {@code CarBrand} per brand entry; empty if the page could not be loaded or the
     *         brand list element is missing
     */
    public List<CarBrand> getCarBrands() {
        List<CarBrand> carBrands = new ArrayList<CarBrand>();
        try {
            Document homePage = Jsoup.connect(HOME_URL).get();
            Element brandList = homePage.getElementsByAttributeValue("class", BRAND_LIST_CLASS).first();
            if (brandList == null) {
                // first() yields null when nothing matches; report it instead of hitting an NPE.
                System.err.println("Brand list element not found on " + HOME_URL);
                return carBrands;
            }
            for (Element entry : brandList.getElementsByTag("p")) {
                // Entries whose id is a single letter A-Z are skipped; only the rest are brands.
                if (isLetterHeading(entry.id())) {
                    continue;
                }
                CarBrand brand = new CarBrand();
                brand.setSeries_brand(entry.id());
                brand.setBrand_name(entry.html());
                brand.setRel(entry.attr("rel"));
                carBrands.add(brand);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Returned here rather than from a finally block, which would discard any exception.
        return carBrands;
    }

    /**
     * Collects the series of every given brand.
     *
     * @param carBrands brands whose series are requested
     * @return the series of all brands, concatenated in brand order
     */
    public List<CarDemio> getCarDemio(List<CarBrand> carBrands) {
        List<CarDemio> carDemios = new ArrayList<CarDemio>();
        for (CarBrand cb : carBrands) {
            String url = SERIES_URL_PREFIX + cb.getSeries_brand() + JSON_URL_SUFFIX;
            carDemios.addAll(getOneCarDemio(url));
        }
        return carDemios;
    }

    /**
     * Collects the vehicle models of every given series.
     *
     * @param carDemios series whose models are requested
     * @return the models of all series, concatenated in series order
     */
    public List<CarVehicle> getCarVehicles(List<CarDemio> carDemios) {
        List<CarVehicle> carVehicles = new ArrayList<CarVehicle>();
        for (CarDemio cd : carDemios) {
            String url = MODEL_URL_PREFIX + cd.getSeries_id() + JSON_URL_SUFFIX;
            carVehicles.addAll(getOneCarVhicle(url));
        }
        return carVehicles;
    }

    /** Loads all series of one brand from the given JSON endpoint. */
    private List<CarDemio> getOneCarDemio(String url) {
        return fetchJsonList(url, CarDemio.class);
    }

    /** Loads all vehicle models of one series from the given JSON endpoint. */
    private List<CarVehicle> getOneCarVhicle(String url) {
        return fetchJsonList(url, CarVehicle.class);
    }

    /**
     * Downloads {@code url} and converts the JSON array in its body into a list of {@code type}.
     * Returns an empty list (after reporting on stderr) when the request failed, the response
     * has no body, or the body cannot be read or converted.
     */
    private <T> List<T> fetchJsonList(String url, Class<T> type) {
        HttpResponse response = requestor.doGet(url);
        // Requestor.doGet returns null when the request failed.
        HttpEntity entity = (response == null) ? null : response.getEntity();
        if (entity == null) {
            System.err.println("No response body for " + url);
            return new ArrayList<T>();
        }
        try {
            JSONArray array = JSONArray.parseArray(EntityUtils.toString(entity));
            if (array == null) {
                System.err.println("No JSON array in response for " + url);
                return new ArrayList<T>();
            }
            return array.toJavaList(type);
        } catch (IOException | RuntimeException e) {
            // Crawl boundary: one bad or unparsable response must not abort the whole run.
            System.err.println("Failed to load " + url);
            e.printStackTrace();
            return new ArrayList<T>();
        }
    }

    /** Returns true when {@code id} is exactly one upper-case letter A-Z. */
    private static boolean isLetterHeading(String id) {
        return id.length() == 1 && id.charAt(0) >= 'A' && id.charAt(0) <= 'Z';
    }
}

4.主要用法
private Requestor requestor = new Requestor();

/**
 * Requests {@code testUrl}, parses the JSON array in the response body into
 * {@code CarDemio} objects and prints each of them.
 *
 * @throws Exception if the response body cannot be read or parsed
 */
@Test
public void testVisitBlog() throws Exception {
    HttpEntity body = requestor.doGet(testUrl).getEntity();
    String json = EntityUtils.toString(body);
    List<CarDemio> seriesList = JSONArray.parseArray(json).toJavaList(CarDemio.class);
    for (CarDemio series : seriesList) {
        System.out.println(series);
    }
}
控制台打印:

CarDemio{series_id='2476', series_group_name='知豆电动车', series_name='知豆', is_green='1'}
CarDemio{series_id='2477', series_group_name='知豆电动车', series_name='知豆D1', is_green='1'}
CarDemio{series_id='2478', series_group_name='知豆电动车', series_name='知豆D2', is_green='1'}
CarDemio{series_id='33135', series_group_name='知豆电动车', series_name='知豆D3', is_green='1'}

 

5.jar包

<!-- Gson: https://mvnrepository.com/artifact/com.google.code.gson/gson (no code shown in this article imports it) -->
<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.2</version>
</dependency>

<!-- jsoup, used by CarGet to load and parse the HTML page: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>

<!-- Apache HttpClient, used by Requestor and AbstractClient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
</dependency>

<!-- JUnit, provides the @Test annotation used in the usage example -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
</dependency>

<!-- fastjson, provides JSONArray for parsing the JSON responses: https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.47</version>
</dependency>







标签:

版权申明:本站文章部分自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点,本站所提供的摄影照片,插画,设计作品,如需使用,请与原作者联系,版权归原作者所有

上一篇:Spring MVC No converter found for return value of type 解决

下一篇:Spring Boot开发MongoDB应用实践