java使用webmagic爬取网页内容的基本环境准备

作者: admin 分类: Scrapy 发布时间: 2020-05-11 10:14  阅读: 181 views

谷歌和火狐的驱动程序可以从下面的地址下载。驱动程序的版本必须与浏览器程序的版本相符,否则会出现一些奇怪的问题。

http://npm.taobao.org/mirrors/chromedriver/
https://github.com/mozilla/geckodriver/releases

这里使用的版本为

谷歌浏览器版本 google-chrome-stable_current_63.0.3239.84_x86_64.rpm

webdriver版本   chromedriver 2.33

火狐浏览器版本 firefox-58.0.1.tar.bz2

webdriver版本  geckodriver 0.19.1
<!-- WebMagic crawler framework -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.6.1</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
        <exclusion>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.6.1</version>
</dependency>

<!-- Selenium browser automation -->
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>3.9.1</version>
</dependency>
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-server</artifactId>
    <version>3.9.1</version>
    <scope>test</scope>
    <exclusions>
        <exclusion>
            <groupId>org.yaml</groupId>
            <artifactId>snakeyaml</artifactId>
        </exclusion>
        <exclusion>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpmime</artifactId>
        </exclusion>
    </exclusions>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpmime -->
<!-- Replaces the httpmime artifact excluded from selenium-server above -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpmime</artifactId>
    <version>4.5.10</version>
</dependency>

下面是一个获取并解析单个网页内容的简单示例:

package com.chl.webmagic.processor;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.alibaba.fastjson.JSON;
import com.chl.base.ApplicationContextBean;
import com.chl.base.BaseUtil;
import com.chl.entity.trans.Law;
import com.chl.entity.trys.Lawselect;
import com.chl.entity.trys.Lawspider;
import com.chl.service.trys.LawSelectService;
import com.chl.service.trys.LawSpiderService;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic {@link PageProcessor} that drives a Chrome browser through Selenium to
 * search the target site for each pending {@link Lawselect} record, stores every
 * matching result row through {@link LawSpiderService}, and publishes one
 * JSON-serialized {@link Law} summary per record under the page field {@code "res"}.
 *
 * @author chenhailong
 */
public class LawInnerTotalProcessor implements PageProcessor{

    Logger logger = LoggerFactory.getLogger(LawInnerTotalProcessor.class);
    // Crawl settings for the target site: retry count and sleep time between requests.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
    public static String baseUrl = "xxx";

    /**
     * Maximum number of records handled by one {@link #process(Page)} call
     * (list indices 0..20), so a single run cannot grow without bound.
     * NOTE(review): older notes on this code mention batches of 10 while the
     * effective limit is 21 - confirm the intended batch size.
     */
    private static final int MAX_LAWS_PER_RUN = 21;

    private static LawSelectService lss;

    private static LawSpiderService ls;

    static
    {
        // Location of the chromedriver executable used by ChromeDriver.
        System.getProperties().setProperty("webdriver.chrome.driver", "/Users/chenhailong/Downloads/tools/nessarytool/chromedriver");

        lss = ApplicationContextBean.getBean("lss");
        ls = ApplicationContextBean.getBean("ls");
    }

    /**
     * Processes up to {@link #MAX_LAWS_PER_RUN} records returned by
     * {@code lss.selectList(Lawselect.type_0)} in one browser session.
     *
     * <p>Processing stops at the first record that raises an exception; that record
     * is marked with {@code Lawselect.type_4}. The browser is always closed, even
     * when an exception escapes this method.
     *
     * @param page WebMagic page that receives the collected JSON strings under {@code "res"}
     */
    @Override
    public void process(Page page) {
        Law law = new Law();
        // Results of every handled record; handed to the pipeline in one go.
        List<String> each = new ArrayList<String>();
        // One generator for the whole run; only used to vary the typing delay.
        Random typingDelay = new Random();

        WebDriver w = new ChromeDriver();
        try {
            List<Lawselect> pending = lss.selectList(Lawselect.type_0);
            for (int ii = 0; ii < pending.size() && ii < MAX_LAWS_PER_RUN; ii++) {
                law.setOnLine(true);
                logger.info("当前所设置的处理环境为!!!!!{},处理第{}个",(law.isOnLine()?"正式":"测试"), ii);

                Lawselect lawselect = pending.get(ii);
                logger.info("处理当前law信息{}",lawselect.toString());
                // Source title; used later when the post tags are built.
                law.setOrganize(lawselect.getName());
                try {
                    handleLaw(w, law, lawselect, typingDelay, each);
                } catch (Exception ex) {
                    // Pass the exception as the throwable argument so the stack trace is logged.
                    logger.error("error message", ex);
                    // Mark as failed so the record can be processed again later.
                    update(lawselect, Lawselect.type_4);
                    if (ex instanceof InterruptedException) {
                        // Keep the interrupt visible to whoever owns this thread.
                        Thread.currentThread().interrupt();
                    }
                    break;
                }
            }
            page.putField("res", each);
        } finally {
            // Always release the browser process, whatever happened above.
            w.quit();
        }
    }

    /**
     * Searches the site for one record, stores the matching result rows and, when
     * every matching row links to its revision history, adds the summary to {@code each}.
     *
     * <p>Each record is evaluated independently: a search without results only
     * marks that record with {@code Lawselect.type_3} and returns.
     *
     * @param w           browser session to drive
     * @param law         shared summary object that is filled and serialized
     * @param lawselect   record being processed
     * @param typingDelay source of the random delay between typed characters
     * @param each        collector for the serialized summaries
     * @throws InterruptedException if the thread is interrupted while waiting for the page
     */
    private void handleLaw(WebDriver w, Law law, Lawselect lawselect, Random typingDelay, List<String> each)
            throws InterruptedException {
        boolean isTotal = true;
        Integer id = lawselect.getId();

        // Not a detail page: work from the search result list.
        w.get(baseUrl + "&t="+System.currentTimeMillis());
        StringBuilder context = new StringBuilder();

        String lawName = lawselect.getName();
        law.setTitle(lawName);
        law.setOriginTitle(lawName);

        // Type the search term into the input box one character at a time,
        // pausing 0-900 ms before each character.
        WebElement search = w.findElement(By.cssSelector(".el-input__inner"));
        for (int i = 0; i < lawName.length(); i++) {
            Thread.sleep( typingDelay.nextInt(10) * 100 );
            search.sendKeys(lawName.substring(i, i+1));
        }
        // Click the search button.
        w.findElement(By.cssSelector(".el-button.search-btn.el-button--primary")).click();

        Thread.sleep(2 * 1000);
        if (BaseUtil.doesWebElementExist(w, By.cssSelector(".pagenotfound-text"))) {
            logger.info("没有根据当前名称{},找到相关信息", lawName);
            update(lawselect, Lawselect.type_3); // mark as "not found"
            return;
        }

        // Number of pager entries for this search.
        WebElement pager = w.findElement(By.cssSelector(".el-pager"));
        int pageNums = pager.findElements(By.xpath("li")).size();
        logger.info("根据当前名称{},获取到的页数为{}!", lawName , pageNums);

        WebElement rightContent = w.findElement(By.cssSelector(".result-inner-wrapper"));
        List<WebElement> shortList = rightContent.findElements(By.xpath("div"));
        List<String> more = new ArrayList<String>();
        String finalTitle = "关于《"+lawName+"》的内容及";

        // Only the first result page is examined, and only when the pager has entries.
        if (pageNums >= 1) {
            for (WebElement srt : shortList) {
                ResultRow row = new ResultRow(srt);
                // Only rows of the configured type are recorded.
                if (!row.type.equals(Law.baseName)) {
                    continue;
                }
                // A title starting with the searched name is a direct hit;
                // anything else is stored as additional information.
                if (row.title().indexOf(lawName) != 0) {
                    insert(id, "exc_"+row.title(), detailHref(srt), Lawspider.type_3);
                    continue;
                }

                // Look for the revision-history link of this row.
                WebElement cGap = srt.findElement(By.cssSelector(".c-gap-top"));
                if (BaseUtil.doesWebElementExist(cGap, By.xpath("a"))) {
                    WebElement selector = cGap.findElements(By.xpath("a")).get(0);
                    String buttonText = selector.getText();
                    if (buttonText.indexOf("已被修订数") >= 0 || buttonText.indexOf("修订历史") >= 0) {
                        logger.info("存在历史修订数据!");
                        more.add(selector.getAttribute("href"));
                        // NOTE(review): throws StringIndexOutOfBoundsException when the button
                        // text has no "(n)" part; the record is then marked as failed.
                        String num = buttonText.substring(buttonText.indexOf("(") + 1, buttonText.indexOf(")"));
                        finalTitle = finalTitle + "被修订"+ num + "次的记录";
                    } else {
                        isTotal = false;
                    }
                } else {
                    isTotal = false;
                }

                context.append("\t\n\n");
                appendEntry(context, row);
                insert(id, row.title(), detailHref(srt), Lawspider.type_2);
            }
        }

        for (String s : more) {
            appendRevisionHistory(w, s, lawName, id, context);
        }

        if (isTotal) {
            context.append("\t\n\n");
            context.append("查找更多法律,请查看");
            context.append("[/url]");
            context.append("\t\n");
            context.append("本站资料均收集于网络,如有侵权,请联系删除。");

            logger.info("进来一次,law的唯一标记为:" + id);
            Integer spiderId = insert(id, finalTitle, w.getCurrentUrl(), Lawspider.type_1);
            // A summary entry is required: store it and queue it for the pipeline.
            law.setContext(context.toString());
            law.setType(Law.type_1);
            law.setFinalTitle(finalTitle);
            law.setPid(spiderId);
            each.add(JSON.toJSONString(law));
        }

        update(lawselect, Lawselect.type_2); // mark as "in progress"
    }

    /**
     * Opens one revision-history listing, walks through all of its pages and records
     * every row of the configured type in both {@code context} and the detail table.
     *
     * @param w       browser session to drive
     * @param url     address of the revision-history listing
     * @param lawName searched name, used for logging only
     * @param id      id of the owning first-level record
     * @param context text buffer the formatted rows are appended to
     * @throws InterruptedException if the thread is interrupted while waiting for the page
     */
    private void appendRevisionHistory(WebDriver w, String url, String lawName, Integer id, StringBuilder context)
            throws InterruptedException {
        w.get(url);

        Thread.sleep(1 * 1000);

        // Number of pager entries of the revision listing.
        WebElement pagerFix = w.findElement(By.cssSelector(".el-pager"));
        int fixPageNums = pagerFix.findElements(By.xpath("li")).size();
        logger.info("根据当前名称{}查找修订详情,获取到的页数为{}!", lawName , fixPageNums);

        List<WebElement> fixShortList =
                w.findElement(By.cssSelector(".result-inner-wrapper")).findElements(By.xpath("div"));
        for (int pages = 1; pages <= fixPageNums; pages++) {
            for (WebElement srt : fixShortList) {
                ResultRow row = new ResultRow(srt);
                // Only rows of the configured type are recorded.
                if (row.type.equals(Law.baseName)) {
                    appendEntry(context, row);
                    insert(id, row.title(), detailHref(srt), Lawspider.type_2);
                }
            }
            // Move on only while another page is left.
            if (pages < fixPageNums) {
                // Click the next pager entry, wait, then re-read the rows of the new page.
                pagerFix = w.findElement(By.cssSelector(".el-pager"));
                pagerFix.findElement(By.xpath("li["+(pages+1)+"]")).click();
                Thread.sleep(2 * 1000);

                fixShortList =
                        w.findElement(By.cssSelector(".result-inner-wrapper")).findElements(By.xpath("div"));
            }
        }
    }

    /**
     * Appends one result row to {@code context} as BBCode-style text: bold title,
     * coloured status, then one labelled line per attribute and a trailing blank line.
     *
     * @param context buffer to append to
     * @param row     scraped row texts
     */
    private static void appendEntry(StringBuilder context, ResultRow row) {
        context.append("[b]" + row.titleLeft + "[/b]");
        context.append(" ");
        // Status text containing "有效" is rendered with the first colour, anything else with the second.
        if (row.titleRight.indexOf("有效") >= 0) {
            context.append("[color=#3CB371]"+row.titleRight+"[/color]");
        } else {
            context.append("[color=#FA8072]"+row.titleRight+"[/color]");
        }
        context.append("\t\n");
        context.append("发布日期:" + row.published);
        context.append("\t\n");
        context.append("实施日期:" + row.effective);
        context.append("\t\n");
        context.append("发布机关:" + row.organ);
        context.append("\t\n");
        context.append("效力级别:" + row.type);
        context.append("\t\n");
        context.append("时效 性:" + row.state);
        context.append("\t\n");
        context.append("\t\n");
    }

    /**
     * Reads the {@code href} of the link inside the first child {@code div} of a result row.
     *
     * @param row result row element
     * @return value of the link's {@code href} attribute
     */
    private static String detailHref(WebElement row) {
        return row.findElement(By.xpath("div[1]/a")).getAttribute("href");
    }

    /** Texts scraped from one entry of the search result list. */
    private static final class ResultRow {
        final String titleLeft;
        final String titleRight;
        final String type;
        final String organ;
        final String published;
        final String effective;
        final String state;

        /**
         * Reads all texts from the given row element.
         *
         * @param row one child {@code div} of the result wrapper
         */
        ResultRow(WebElement row) {
            titleLeft  = row.findElement(By.xpath("div/a")).getText();
            titleRight = row.findElement(By.xpath("div/span")).getText();
            type       = row.findElement(By.xpath("div[2]/div/span[1]")).getText();
            organ      = row.findElement(By.xpath("div[2]/div/span[2]")).getText();
            published  = row.findElement(By.xpath("div[2]/div/span[3]")).getText();
            effective  = row.findElement(By.xpath("div[2]/div/span[4]")).getText();
            state      = row.findElement(By.xpath("div[1]/span")).getText();
        }

        /** @return the link text and the status text joined by an underscore */
        String title() {
            return titleLeft + "_" + titleRight;
        }
    }


    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Sets the status of a first-level record and persists it through {@code lss}.
     *
     * @param lawselect record to update
     * @param type      new status value; stored as a {@code byte}
     */
    private void update(Lawselect lawselect,int type) {
        lawselect.setType((byte)type);
        lss.updateLawSelect(lawselect);
    }

    /**
     * Inserts a row into the detail table and returns its primary key.
     *
     * @param pid   id of the owning first-level record
     * @param title title to store
     * @param url   url to store
     * @param type  row type, one of the {@code Lawspider.type_*} values
     * @return id reported by the entity after {@code insertSpider} was called
     */
    private int insert(int pid,String title,String url,int type) {
        Lawspider lawspider = new Lawspider();
        lawspider.setPid(pid);
        lawspider.setTitle(title);
        lawspider.setUrl(url);
        lawspider.setStatus(Lawspider.status_1);
        lawspider.setType(type);
        ls.insertSpider(lawspider);
        return lawspider.getId();
    }

}

可以做一些简单的练习,入门还是比较快的。


   原创文章,转载请标明本文链接: java使用webmagic爬取网页内容的基本环境准备

如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!

发表评论

电子邮件地址不会被公开。 必填项已用*标注

更多阅读