java使用webmagic爬取网页内容的基本环境准备
谷歌和火狐的驱动程序下载,需要与浏览器程序的版本相符,否则会出现一些奇怪的问题。
http://npm.taobao.org/mirrors/chromedriver/
https://github.com/mozilla/geckodriver/releases
这里使用的版本为
谷歌浏览器版本 google-chrome-stable_current_63.0.3239.84_x86_64.rpm
webdriver版本 chromedriver 2.33
火狐浏览器版本 firefox-58.0.1.tar.bz2
webdriver版本 geckodriver 0.19.1
<!-- WebMagic crawler framework (core).
     slf4j-log4j12 is excluded to avoid a duplicate SLF4J binding.
     NOTE: fastjson is excluded here, yet the crawler code imports
     com.alibaba.fastjson.JSON - a fastjson dependency must be declared
     elsewhere in the build or the code will not compile. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.6.1</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.6.1</version>
</dependency>
<!-- Selenium for driving a real browser (Chrome/Firefox). -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.9.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-server</artifactId>
<version>3.9.1</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.yaml</groupId>
<artifactId>snakeyaml</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpmime -->
<!-- Replaces the httpmime version pulled in transitively by selenium-server. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.5.10</version>
</dependency>
这里是简单获取一个网页内容的解析
package com.chl.webmagic.processor;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSON;
import com.chl.base.ApplicationContextBean;
import com.chl.base.BaseUtil;
import com.chl.entity.trans.Law;
import com.chl.entity.trys.Lawselect;
import com.chl.entity.trys.Lawspider;
import com.chl.service.trys.LawSelectService;
import com.chl.service.trys.LawSpiderService;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* 爬取一个网页的内容
* @author chenhailong
*
*/
/**
 * WebMagic {@link PageProcessor} that crawls a law-search site with Selenium.
 * For each pending {@link Lawselect} record it types the law name into the
 * site's search box, scrapes the first result page (plus every page of any
 * linked "revision history" list), persists each matching row via
 * {@link LawSpiderService}, and hands the aggregated BBCode summaries to the
 * pipeline under the key {@code "res"}.
 *
 * @author chenhailong
 */
public class LawInnerTotalProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(LawInnerTotalProcessor.class);

    // Crawl configuration: retry count and request interval.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /** Search entry URL; a cache-busting timestamp is appended per request. */
    public static String baseUrl = "xxx";

    private static LawSelectService lss;
    private static LawSpiderService spiderService;

    static {
        // chromedriver binary location; must match the installed Chrome version.
        System.getProperties().setProperty("webdriver.chrome.driver",
                "/Users/chenhailong/Downloads/tools/nessarytool/chromedriver");
        lss = ApplicationContextBean.getBean("lss");
        spiderService = ApplicationContextBean.getBean("ls");
    }

    @Override
    public void process(Page page) {
        Law law = new Law();
        Integer spiderId = 0;
        // JSON snapshot of each finished Law, handed to the pipeline at the end.
        List<String> each = new ArrayList<String>();
        Random random = new Random(); // reused; previously allocated per keystroke
        WebDriver w = new ChromeDriver();
        try {
            List<Lawselect> pendingList = lss.selectList(Lawselect.type_0);
            for (int ii = 0; ii < pendingList.size(); ii++) {
                law.setOnLine(true);
                logger.info("当前所设置的处理环境为!!!!!{},处理第{}个", (law.isOnLine() ? "正式" : "测试"), ii);
                // Cap one run at 21 records (indices 0..20) to keep memory bounded.
                if (ii > 20) { break; }
                Lawselect lawselect = pendingList.get(ii);
                // BUGFIX: both flags are reset per record. `bool` used to be
                // declared outside the loop, so a single "not found" result
                // disabled detail processing for every subsequent record.
                boolean bool = true;
                boolean isTotal = true;
                Integer id = lawselect.getId();
                logger.info("处理当前law信息{}", lawselect.toString());
                law.setOrganize(lawselect.getName()); // source title, used when tagging the post
                try {
                    // Not a detail page -> handle the search-result summary list.
                    w.get(baseUrl + "&t=" + System.currentTimeMillis());
                    StringBuilder context = new StringBuilder();
                    String lawName = lawselect.getName();
                    law.setTitle(lawName);
                    law.setOriginTitle(lawName);
                    // Type the search term one character at a time with a random
                    // 0-900ms pause, mimicking human input.
                    WebElement search = w.findElement(By.cssSelector(".el-input__inner"));
                    for (int i = 0; i < lawName.length(); i++) {
                        Thread.sleep(random.nextInt(10) * 100);
                        search.sendKeys(lawName.substring(i, i + 1));
                    }
                    // Click search and wait for the result list to render.
                    w.findElement(By.cssSelector(".el-button.search-btn.el-button--primary")).click();
                    Thread.sleep(2 * 1000);
                    // A "page not found" panel means no hit for this name.
                    if (BaseUtil.doesWebElementExist(w, By.cssSelector(".pagenotfound-text"))) {
                        logger.info("没有根据当前名称{},找到相关信息", lawName);
                        bool = false;
                        update(lawselect, Lawselect.type_3); // mark as "not found"
                    }
                    if (bool) {
                        // Page count of the current search (used for logging only;
                        // see the break below - only the first page is scraped).
                        WebElement pager = w.findElement(By.cssSelector(".el-pager"));
                        List<WebElement> pageList = pager.findElements(By.xpath("li"));
                        int pageNums = pageList.size();
                        logger.info("根据当前名称{},获取到的页数为{}!", lawName, pageNums);
                        WebElement rightContent = w.findElement(By.cssSelector(".result-inner-wrapper"));
                        List<WebElement> shortList = rightContent.findElements(By.xpath("div"));
                        int count = shortList.size(); // result rows on the current page
                        // Revision-history links collected here are followed later.
                        ArrayList<String> more = new ArrayList<String>();
                        String finalTitle = "关于《" + lawName + "》的内容及";
                        for (int pages = 1; pages < pageNums + 1; pages++) {
                            if (pages > 1) {
                                break; // only the first result page is processed
                            }
                            for (int i = 0; i < count; i++) {
                                WebElement srt = shortList.get(i);
                                String titleLeft = srt.findElement(By.xpath("div/a")).getText();
                                String titleRight = srt.findElement(By.xpath("div/span")).getText();
                                String innerTitle = titleLeft + "_" + titleRight;
                                String innerType = srt.findElement(By.xpath("div[2]/div/span[1]")).getText();
                                String innerOrgan = srt.findElement(By.xpath("div[2]/div/span[2]")).getText();
                                String innerPub = srt.findElement(By.xpath("div[2]/div/span[3]")).getText();
                                String innerImp = srt.findElement(By.xpath("div[2]/div/span[4]")).getText();
                                String innerState = srt.findElement(By.xpath("div[1]/span")).getText();
                                // Only rows of the level we are collecting are recorded.
                                if (innerType.equals(Law.baseName)) {
                                    // Title starting with the search term = direct hit;
                                    // anything else is stored as auxiliary info.
                                    if (innerTitle.indexOf(lawName) == 0) {
                                        // Look for a revision-history link under the row.
                                        WebElement cGap = srt.findElement(By.cssSelector(".c-gap-top"));
                                        if (BaseUtil.doesWebElementExist(cGap, By.xpath("a"))) {
                                            List<WebElement> links = cGap.findElements(By.xpath("a"));
                                            WebElement selector = links.get(0);
                                            String buttonText = selector.getText();
                                            if (buttonText.indexOf("已被修订数") >= 0 || buttonText.indexOf("修订历史") >= 0) {
                                                logger.info("存在历史修订数据!");
                                                more.add(selector.getAttribute("href"));
                                                String num = buttonText.substring(buttonText.indexOf("(") + 1, buttonText.indexOf(")"));
                                                finalTitle = finalTitle + "被修订" + num + "次的记录";
                                            } else {
                                                isTotal = false;
                                            }
                                        } else {
                                            isTotal = false;
                                        }
                                        context.append("\t\n\n");
                                        appendEntry(context, titleLeft, titleRight, innerPub, innerImp, innerOrgan, innerType, innerState);
                                        insert(id, innerTitle, srt.findElement(By.xpath("div[1]/a")).getAttribute("href"), Lawspider.type_2);
                                    } else {
                                        insert(id, "exc_" + innerTitle, srt.findElement(By.xpath("div[1]/a")).getAttribute("href"), Lawspider.type_3);
                                    }
                                }
                            }
                        }
                        // Follow each recorded revision-history link, scraping ALL of its pages.
                        for (String s : more) {
                            w.get(s);
                            Thread.sleep(1 * 1000);
                            WebElement pagerFix = w.findElement(By.cssSelector(".el-pager"));
                            List<WebElement> fixPageList = pagerFix.findElements(By.xpath("li"));
                            int fixPageNums = fixPageList.size();
                            logger.info("根据当前名称{}查找修订详情,获取到的页数为{}!", lawName, fixPageNums);
                            WebElement fixRightContent = w.findElement(By.cssSelector(".result-inner-wrapper"));
                            List<WebElement> fixShortList = fixRightContent.findElements(By.xpath("div"));
                            int fixCount = fixShortList.size();
                            for (int pages = 1; pages < fixPageNums + 1; pages++) {
                                for (int i = 0; i < fixCount; i++) {
                                    WebElement srt = fixShortList.get(i);
                                    String titleLeft = srt.findElement(By.xpath("div/a")).getText();
                                    String titleRight = srt.findElement(By.xpath("div/span")).getText();
                                    String innerTitle = titleLeft + "_" + titleRight;
                                    String innerType = srt.findElement(By.xpath("div[2]/div/span[1]")).getText();
                                    String innerOrgan = srt.findElement(By.xpath("div[2]/div/span[2]")).getText();
                                    String innerPub = srt.findElement(By.xpath("div[2]/div/span[3]")).getText();
                                    String innerImp = srt.findElement(By.xpath("div[2]/div/span[4]")).getText();
                                    String innerState = srt.findElement(By.xpath("div[1]/span")).getText();
                                    if (innerType.equals(Law.baseName)) {
                                        appendEntry(context, titleLeft, titleRight, innerPub, innerImp, innerOrgan, innerType, innerState);
                                        insert(id, innerTitle, srt.findElement(By.xpath("div[1]/a")).getAttribute("href"), Lawspider.type_2);
                                    }
                                }
                                // Advance to the next page (only when one exists) and re-read the list.
                                if (fixPageNums > 1 && pages < fixPageNums) {
                                    pagerFix = w.findElement(By.cssSelector(".el-pager"));
                                    pagerFix.findElement(By.xpath("li[" + (pages + 1) + "]")).click();
                                    Thread.sleep(2 * 1000);
                                    fixRightContent = w.findElement(By.cssSelector(".result-inner-wrapper"));
                                    fixShortList = fixRightContent.findElements(By.xpath("div"));
                                    fixCount = fixShortList.size();
                                }
                            }
                        }
                        if (isTotal) {
                            context.append("\t\n\n");
                            context.append("查找更多法律,请查看");
                            context.append("[/url]");
                            context.append("\t\n");
                            context.append("本站资料均收集于网络,如有侵权,请联系删除。");
                            logger.info("进来一次,law的唯一标记为:" + id);
                            spiderId = insert(id, finalTitle, w.getCurrentUrl(), Lawspider.type_1);
                            // Persist the aggregated summary post.
                            law.setContext(context.toString());
                            law.setType(Law.type_1);
                            law.setFinalTitle(finalTitle);
                            law.setPid(spiderId);
                            each.add(JSON.toJSONString(law));
                        }
                        update(lawselect, Lawselect.type_2); // mark as "in progress"
                    }
                } catch (Exception ex) {
                    // BUGFIX: pass the throwable as the trailing argument so SLF4J
                    // logs the full stack trace ("{}" + ex printed only ex.toString()).
                    logger.error("error message", ex);
                    update(lawselect, Lawselect.type_4); // mark as failed, to be retried
                    break;
                }
            }
            page.putField("res", each);
        } finally {
            // BUGFIX: always release the browser - previously the ChromeDriver
            // process leaked if an exception escaped the loop body.
            w.quit();
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Appends one search-result entry to the aggregated BBCode context:
     * bold title, validity state colored green when it contains "有效"
     * (in force) and red otherwise, followed by the publish/implementation
     * dates, issuing organ, level and validity fields.
     */
    private void appendEntry(StringBuilder context, String titleLeft, String titleRight,
            String innerPub, String innerImp, String innerOrgan, String innerType, String innerState) {
        context.append("[b]").append(titleLeft).append("[/b]");
        context.append(" ");
        if (titleRight.indexOf("有效") >= 0) {
            context.append("[color=#3CB371]").append(titleRight).append("[/color]");
        } else {
            context.append("[color=#FA8072]").append(titleRight).append("[/color]");
        }
        context.append("\t\n");
        context.append("发布日期:").append(innerPub);
        context.append("\t\n");
        context.append("实施日期:").append(innerImp);
        context.append("\t\n");
        context.append("发布机关:").append(innerOrgan);
        context.append("\t\n");
        context.append("效力级别:").append(innerType);
        context.append("\t\n");
        context.append("时效 性:").append(innerState);
        context.append("\t\n");
        context.append("\t\n");
    }

    /**
     * Updates the processing state of a first-level record.
     *
     * @param ls   the record to update
     * @param type new state (one of the Lawselect.type_* constants)
     */
    private void update(Lawselect ls, int type) {
        ls.setType((byte) type);
        lss.updateLawSelect(ls);
    }

    /**
     * Inserts a detail row and returns its generated primary key.
     *
     * @param pid   parent Lawselect id
     * @param title scraped title
     * @param url   scraped detail-page URL
     * @param type  row type (one of the Lawspider.type_* constants)
     * @return generated id of the inserted row
     */
    private int insert(int pid, String title, String url, int type) {
        Lawspider lawspider = new Lawspider();
        lawspider.setPid(pid);
        lawspider.setTitle(title);
        lawspider.setUrl(url);
        lawspider.setStatus(Lawspider.status_1);
        lawspider.setType(type);
        spiderService.insertSpider(lawspider);
        return lawspider.getId();
    }
}
可以做一些简单的练习,入门还是比较快的。
原创文章,转载请标明本文链接: java使用webmagic爬取网页内容的基本环境准备