In this post we use Eclipse to write Java code that crawls job-posting data from Lagou (lagou.com).
Prepare hadoop-2.7.3.tar.gz, Eclipse, hadoop-eclipse-plugin-2.7.3.jar, hadoop.dll, winutils.exe
and the other related files:
Link: https://pan.baidu.com/s/1DnTw3lChFJy_fRfkKXInBg
Extraction code: xzyp
hadoop-2.7.3.tar.gz: link: https://pan.baidu.com/s/1I1FvgICCyeBURzGx62l4QA
Extraction code: xzyp
Download and unpack Maven.
Configuring the environment variables is covered by any number of online guides, so it is not repeated here.
Create a new local repository folder named repository under the Maven directory.
Configure localRepository in conf/settings.xml to point at that folder; a minimal sketch follows (the path is only an example and must be replaced with your own):
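```xml
<!-- conf/settings.xml: point Maven's local repository at the folder created
     above; D:\maven\repository is an example path, not a required location -->
<localRepository>D:\maven\repository</localRepository>
```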
Press Win+R, type cmd, and run:
mvn help:system
If the command succeeds, your repository directory will contain some newly downloaded files.
Configure Maven in Eclipse:
Open Eclipse and click Window -> Preferences.
With that, Maven is configured.
Create a Maven project:
File -> New -> Other -> Maven -> Maven Project
This is the file layout after the project is created.
Add the following code to pom.xml:
```xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.itcast.jobcase</groupId>
  <artifactId>jobcase-l</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <dependencies>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.4</version>
    </dependency>
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.8</version>
      <scope>system</scope>
      <systemPath>C:/Program Files/Java/jdk1.8.0_341/lib/tools.jar</systemPath>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.3</version>
    </dependency>
  </dependencies>
</project>
```
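Note that the jdk.tools dependency uses a system scope with an absolute systemPath; the path above must match where JDK 1.8 is installed on your own machine, so adjust it to point at your local tools.jar if needed.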

The project directory after everything is set up:
Under src/main/java, create a package named com.position.l (the package name must match the package declarations in the code below).
Create four classes inside it.
First: HttpClientData.java
```java
package com.position.l;

import java.util.HashMap;
import java.util.Map;

public class HttpClientData {
    public static void main(String[] args) throws Exception {
        // Build the request headers
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Cookie","RECOMMEND_TIP=true; user_trace_token=20230509172245-850b8329-0db6-49d5-8ee5-788463473366; LGUID=20230509172245-ee291504-af55-4823-8b8f-da7830adea64; _ga=GA1.2.1941570256.1683624167; index_location_city=%E5%85%A8%E5%9B%BD; _gid=GA1.2.744431736.1684134362; privacyPolicyPopup=false; __lg_stoken__=00ef87c190275da025cc19a93d14d5da80c4c3ff29516c88d738dd7350f8601ae184994af7785dc2260517aa65b80ae0048d5bdb5ea64e76bf2b4df769b1de46bfa3cc6bd487; SEARCH_ID=c6e9d66fa6f64d48874952a58bf47660; gate_login_token=v1####da9e29af0db73d825a22a9a882bf9ddbb316eae052443e729b53cab3f19a8e70; LG_HAS_LOGIN=1; hasDeliver=0; __SAFETY_CLOSE_TIME__26120270=1; JSESSIONID=ABAAABAABEIABCI02041966E510E7120309F7B2F34013BF; WEBTJ-ID=20230515193746-1881f33c337109-00aed6e8f8a02b-7b515477-1327104-1881f33c33814dd; _putrc=743692222AE66441123F89F2B170EADC; login=true; unick=%E7%94%A8%E6%88%B77560; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1683624167,1684134362,1684150667; sensorsdata2015session=%7B%7D; X_HTTP_TOKEN=d5afe4428dfdf76486605148610ad9240e30d415e3; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1684150668; TG-TRACK-CODE=index_zhaopin; LGRID=20230515193751-230d34f5-3f31-4ddd-8bb0-23d58400b756; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2226120270%22%2C%22first_id%22%3A%22187ffd2094fb06-0d35cf63f3901a-7b515477-1327104-187ffd20950ca3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fcn.bing.com%2F%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%22113.0.0.0%22%7D%2C%22%24device_id%22%3A%22187ffd2094fb06-0d35cf63f3901a-7b515477-1327104-187ffd20950ca3%22%7D");
        headers.put("Connection", "keep-alive");
        headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
        headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6");
        headers.put("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
                + "AppleWebKit/537.36 (KHTML, like Gecko) "
                + "Chrome/113.0.0.0 Mobile Safari/537.36 Edg/113.0.1774.42");
        headers.put("Content-Type", "text/html; charset=utf-8");
        headers.put("Referer", "https://www.lagou.com/jobs/list_%E5%A4%A7%E6%95%B0%E6%8D%AE/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput=");
        headers.put("Origin", "https://www.lagou.com");
        headers.put("X-Requested-With", "XMLHttpRequest");
        headers.put("X-Anit-Forge-Token", "None");
        headers.put("Cache-Control", "no-cache");
        headers.put("X-Anit-Forge-Code", "0");
        headers.put("Host", "www.lagou.com");

        // Build the request parameters: keyword "大数据" (big data), city "全国" (nationwide)
        Map<String, String> params = new HashMap<String, String>();
        params.put("kd", "大数据");
        params.put("city", "全国");

        // Fetch pages 1 through 30; "pn" is the page-number parameter
        for (int i = 1; i < 31; i++) {
            params.put("pn", String.valueOf(i));
            HttpClientResp result = HttpClientUtils.doPost(
                    "https://www.lagou.com/jobs/positionAjax.json?"
                            + "needAddtionalResult=false&first=true&px=default",
                    headers, params);
            // Store each page of results in HDFS
            HttpClientHdfsUtils.createFileBySysTime(
                    "hdfs://192.168.25.128:9000", "page" + i, result.toString());
            // Pause between requests to avoid hammering the server
            Thread.sleep(1 * 500);
        }
    }
}
```

If no data comes back, the request headers are the problem: register an account on lagou.com, log in, copy your own Cookie and other request headers from the browser's developer tools (Network tab), and substitute them into the code above.
Second: HttpClientHdfsUtils.java
```java
package com.position.l;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class HttpClientHdfsUtils {
    public static void createFileBySysTime(String url, String fileName, String data) {
        // Specify the user that operates on HDFS
        System.setProperty("HADOOP_USER_NAME", "root");
        Path path = null;
        // Read the current system time
        Calendar calendar = Calendar.getInstance();
        Date time = calendar.getTime();
        // Format the system time as yyyyMMdd
        SimpleDateFormat format = new SimpleDateFormat("yyyyMMdd");
        // The formatted date string becomes the name of the folder that stores the data
        String filePath = format.format(time);
        // Build a Configuration object holding the Hadoop settings
        Configuration conf = new Configuration();
        // Build the URI of the HDFS NameNode
        URI uri = URI.create(url);
        // The FileSystem object handles file- and directory-related operations
        FileSystem fileSystem;
        try {
            // Obtain the file system object
            fileSystem = FileSystem.get(uri, conf);
            // Define the target directory
            path = new Path("/success/" + filePath);
            // Create the directory if it does not exist yet
            if (!fileSystem.exists(path)) {
                fileSystem.mkdirs(path);
            }
            // Create the file under the target directory
            FSDataOutputStream fsDataOutputStream = fileSystem.create(
                    new Path(path.toString() + "/" + fileName));
            // Write the data into the file
            IOUtils.copyBytes(new ByteArrayInputStream(data.getBytes()),
                    fsDataOutputStream, conf, true);
            // Close the connection and release resources
            fileSystem.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
```
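Before running the full crawl, it can be worth checking the HDFS write path on its own. Below is a minimal, hypothetical smoke test (the class name is made up for illustration); it reuses the NameNode URI from HttpClientData above, which you should replace with your own cluster address:

```java
package com.position.l;

// Hypothetical one-off check that HDFS writes succeed; the NameNode URI is
// the one used by the crawler above and must match your own cluster.
public class HdfsWriteTest {
    public static void main(String[] args) {
        HttpClientHdfsUtils.createFileBySysTime(
                "hdfs://192.168.25.128:9000", "connectivity-test", "hello hdfs");
        // On success, /success/<yyyyMMdd>/connectivity-test appears in HDFS
    }
}
```

If this small test fails, fix the cluster address and the hadoop.dll/winutils.exe setup before moving on.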

Third: HttpClientResp.java
```java
package com.position.l;

import java.io.Serializable;

public class HttpClientResp implements Serializable {
    private static final long serialVersionUID = -2224539827395038194L;
    // Response status code
    private int code;
    // Response body
    private String content;

    // No-arg constructor (Eclipse can generate these via Alt+Shift+S)
    public HttpClientResp() {
    }

    public HttpClientResp(int code) {
        super();
        this.code = code;
    }

    public HttpClientResp(String content) {
        this.content = content;
    }

    public HttpClientResp(int code, String content) {
        this.code = code;
        this.content = content;
    }

    // Getters and setters
    public int getCode() {
        return code;
    }

    public void setCode(int code) {
        this.code = code;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    // Override toString()
    @Override
    public String toString() {
        return "HttpClientResp [code=" + code + ", content=" + content + "]";
    }
}
```
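As a quick illustration of the wrapper, constructing one directly shows the toString() format that later gets written to HDFS (the demo class name is made up):

```java
package com.position.l;

// Tiny demonstration of the wrapper's toString() format; the crawler stores
// result.toString() in HDFS, so each saved page follows exactly this shape.
public class HttpClientRespDemo {
    public static void main(String[] args) {
        HttpClientResp resp = new HttpClientResp(200, "{\"success\":true}");
        System.out.println(resp); // HttpClientResp [code=200, content={"success":true}]
    }
}
```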

Fourth: HttpClientUtils.java
```java
package com.position.l;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpEntityEnclosingRequestBase;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class HttpClientUtils {
    // Character encoding; all requests are sent as UTF-8
    private static final String ENCODING = "UTF-8";
    // Connection timeout in milliseconds
    private static final int CONNECT_TIMEOUT = 6000;
    // Timeout for receiving data (i.e. the response time), in milliseconds
    private static final int SOCKET_TIMEOUT = 6000;

    // Put the HTTP request headers onto the request object
    public static void packageHeader(Map<String, String> params, HttpRequestBase httpMethod) {
        if (params != null) {
            // entrySet() returns the set of key-value pairs in params;
            // the for-each loop takes one entry at a time
            Set<Entry<String, String>> entrySet = params.entrySet();
            for (Entry<String, String> entry : entrySet) {
                // Copy each key and value into the HttpRequestBase headers
                httpMethod.setHeader(entry.getKey(), entry.getValue());
            }
        }
    }

    // Put the HTTP request parameters onto the request object
    public static void packageParam(Map<String, String> params,
            HttpEntityEnclosingRequestBase httpMethod) throws UnsupportedEncodingException {
        if (params != null) {
            // NameValuePair is a simple name/value pair, commonly used to hold
            // the parameters of a POST request in a list
            List<NameValuePair> nvps = new ArrayList<NameValuePair>();
            Set<Entry<String, String>> entrySet = params.entrySet();
            for (Entry<String, String> entry : entrySet) {
                // Copy each key and value from entry into the nvps list
                nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
            // Attach the parameters to the request; ENCODING is the constant above
            httpMethod.setEntity(new UrlEncodedFormEntity(nvps, ENCODING));
        }
    }

    // Execute the request and extract the HTTP response content
    public static HttpClientResp getHttpClientResult(CloseableHttpResponse httpResponse,
            CloseableHttpClient httpClient, HttpRequestBase httpMethod) throws Exception {
        // Execute the HTTP request
        httpResponse = httpClient.execute(httpMethod);
        // Read the response, if there is one
        if (httpResponse != null && httpResponse.getStatusLine() != null) {
            String content = "";
            if (httpResponse.getEntity() != null) {
                // Convert the response body to a String using the chosen encoding
                content = EntityUtils.toString(httpResponse.getEntity(), ENCODING);
            }
            // Return an HttpClientResp whose two fields carry the
            // response status code and response body
            return new HttpClientResp(httpResponse.getStatusLine().getStatusCode(), content);
        }
        // If no response was received, report an error status code
        return new HttpClientResp(HttpStatus.SC_INTERNAL_SERVER_ERROR);
    }

    // doPost() submits the request headers and parameters via an HTTP POST and
    // returns the status code and JSON body sent back by the server
    public static HttpClientResp doPost(String url, Map<String, String> headers,
            Map<String, String> params) throws Exception {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the HttpPost object
        HttpPost httpPost = new HttpPost(url);
        // setConnectTimeout: connection timeout in milliseconds.
        // setConnectionRequestTimeout: timeout for obtaining a connection from
        // the connection manager (pool), in milliseconds; this newer setting
        // exists because connection pools can now be shared.
        // setSocketTimeout: response timeout in milliseconds; if no data comes
        // back within this window, the call is abandoned.
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(CONNECT_TIMEOUT)
                .setSocketTimeout(SOCKET_TIMEOUT).build();
        // Apply the configuration to the POST request
        httpPost.setConfig(requestConfig);
        // Set the request headers via packageHeader()
        packageHeader(headers, httpPost);
        // Set the request parameters via packageParam()
        packageParam(params, httpPost);
        // The CloseableHttpResponse object will hold the response
        CloseableHttpResponse httpResponse = null;
        try {
            // Execute the request and return the result
            return getHttpClientResult(httpResponse, httpClient, httpPost);
        } finally {
            // Release resources
            release(httpResponse, httpClient);
        }
    }

    // release() frees the httpResponse (HTTP response) and
    // httpClient (HTTP request) resources
    private static void release(CloseableHttpResponse httpResponse,
            CloseableHttpClient httpClient) throws IOException {
        if (httpResponse != null) {
            httpResponse.close();
        }
        if (httpClient != null) {
            httpClient.close();
        }
    }
}
```
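To exercise the utility class in isolation, here is a minimal, hypothetical smoke test; it POSTs to https://httpbin.org/post (an arbitrary public echo service used only for illustration, not part of this project) and prints the wrapped response:

```java
package com.position.l;

import java.util.HashMap;
import java.util.Map;

// Hypothetical smoke test for HttpClientUtils.doPost; httpbin.org echoes the
// request back, so a code of 200 means the header/parameter plumbing works.
public class HttpClientUtilsTest {
    public static void main(String[] args) throws Exception {
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("User-Agent", "test-client");
        Map<String, String> params = new HashMap<String, String>();
        params.put("kd", "大数据");
        HttpClientResp resp = HttpClientUtils.doPost("https://httpbin.org/post", headers, params);
        // Expected output shape: HttpClientResp [code=200, content={...}]
        System.out.println(resp);
    }
}
```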

Right-click the project name -> Run As -> Java Application.
After the run finishes you will see the data on your Hadoop cluster, as shown in the figure.
Note that Hadoop must be running while the data is being crawled (for example, started with sbin/start-dfs.sh), otherwise the program will throw an error. With that, we have collected the data; next time we will process it.