webmagic框架实现java爬虫入门案例

论坛 期权论坛 脚本     
匿名技术用户   2021-1-14 13:58   35   0

我们要爬取的是:http://www.coupling.pw/fl/dy

这里的知识点先留着,下次有时间再整理

这里为了方便,用的maven项目

pom.xml如下:

 <!-- JUnit 4, test scope only -->
 <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>

        <!-- WebMagic crawler framework: core module plus the extension module, same version -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>



        <!-- Selenium artifacts, all pinned to 3.0.1; not referenced by the Java code shown in this article -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>3.0.1</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>3.0.1</version>
        </dependency>

        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-remote-driver</artifactId>
            <version>3.0.1</version>
        </dependency>

        <!-- PhantomJS WebDriver binding; not referenced by the Java code shown in this article -->
        <dependency>
            <groupId>com.codeborne</groupId>
            <artifactId>phantomjsdriver</artifactId>
            <version>1.2.1</version>
        </dependency>

        <!-- Apache Commons Exec; not referenced by the Java code shown in this article -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-exec</artifactId>
            <version>1.3</version>
        </dependency>

        <!-- MySQL JDBC driver; Utils.getConnection loads com.mysql.jdbc.Driver by name -->
        <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.6</version>
        </dependency>

首先是config.properties

# Database connection settings (read by MyProcessor's static initializer)
db_url=jdbc:mysql://localhost:3306/ziyuan?useUnicode=true&characterEncoding=utf-8
db_username=root
db_password=962464
# Name of the table that crawled movies are written to
db_table=movie
# Start URL handed to the spider in MyProcessor.main
dy=http://www.coupling.pw/fl/dy

pojo:

电影movie:

/**
 * Mutable data holder for one movie record.
 *
 * <p>All ten attributes are plain strings. Each one has a getter and a setter, and
 * {@link #toString()} renders every attribute in declaration order.
 */
public class Movie {

    private String name;
    private String url;
    private String createtime;
    private String updatetime;
    /** Information text. */
    private String information;
    /** Introduction text. */
    private String introduce;
    /** Manual-update marker. */
    private String manual;
    /** Image. */
    private String image;
    /** Type. */
    private String type;
    /** Label(s). */
    private String label;

    /** Creates a movie whose attributes are all {@code null}. */
    public Movie() {
    }

    /** Creates a movie with every attribute supplied up front. */
    public Movie(String name, String url, String createtime, String updatetime, String information, String introduce, String manual, String image, String type, String label) {
        this.name = name;
        this.url = url;
        this.createtime = createtime;
        this.updatetime = updatetime;
        this.information = information;
        this.introduce = introduce;
        this.manual = manual;
        this.image = image;
        this.type = type;
        this.label = label;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getCreatetime() {
        return createtime;
    }

    public void setCreatetime(String createtime) {
        this.createtime = createtime;
    }

    public String getUpdatetime() {
        return updatetime;
    }

    public void setUpdatetime(String updatetime) {
        this.updatetime = updatetime;
    }

    public String getInformation() {
        return information;
    }

    public void setInformation(String information) {
        this.information = information;
    }

    public String getIntroduce() {
        return introduce;
    }

    public void setIntroduce(String introduce) {
        this.introduce = introduce;
    }

    public String getManual() {
        return manual;
    }

    public void setManual(String manual) {
        this.manual = manual;
    }

    public String getImage() {
        return image;
    }

    public void setImage(String image) {
        this.image = image;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getLabel() {
        return label;
    }

    public void setLabel(String label) {
        this.label = label;
    }

    /**
     * Renders the record as {@code Movie{name='...', url='...', ...}}; a {@code null}
     * attribute is printed as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Movie{");
        text.append(pair("name", name)).append(", ");
        text.append(pair("url", url)).append(", ");
        text.append(pair("createtime", createtime)).append(", ");
        text.append(pair("updatetime", updatetime)).append(", ");
        text.append(pair("information", information)).append(", ");
        text.append(pair("introduce", introduce)).append(", ");
        text.append(pair("manual", manual)).append(", ");
        text.append(pair("image", image)).append(", ");
        text.append(pair("type", type)).append(", ");
        text.append(pair("label", label));
        return text.append('}').toString();
    }

    /** Formats one attribute as {@code key='value'}. */
    private static String pair(String key, String value) {
        return key + "='" + value + '\'';
    }
}

数据库DataSourceModel:

/**
 * Holds the three values needed to open a JDBC connection: URL, user name and password.
 *
 * <p>Instances start with all values {@code null} and are filled in through the setters.
 */
public class DataSourceModel {
    private String url;
    private String username;
    private String password;

    /**
     * Creates an empty model.
     *
     * <p>Public so that the (public) class can also be instantiated from other packages;
     * it used to be package-private, which only worked for callers in the same package.
     */
    public DataSourceModel() {
    }

    /** @return the JDBC URL, or {@code null} if not set */
    public String getUrl() {
        return url;
    }

    /** @param url the JDBC URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the database user name, or {@code null} if not set */
    public String getUsername() {
        return username;
    }

    /** @param username the database user name */
    public void setUsername(String username) {
        this.username = username;
    }

    /** @return the database password, or {@code null} if not set */
    public String getPassword() {
        return password;
    }

    /** @param password the database password */
    public void setPassword(String password) {
        this.password = password;
    }
}

然后是工具类:

/**
 * Static helpers used by the crawler: image download, configuration loading and thin
 * JDBC wrappers.
 *
 * <p>None of the helpers propagate checked exceptions; failures are reported on the
 * console and signalled through the return value ({@code ""}, an empty
 * {@link Properties}, {@code null} or {@code 0}).
 */
public class Utils {

    /** Directory prefix that downloaded images are written under. */
    private static final String IMAGE_DIR = "C:\\Users\\hasee\\Desktop\\http\\";

    /**
     * Downloads the image at {@code url} into the image directory under a random
     * UUID-based file name.
     *
     * <p>The file extension is taken from the last segment of the URL <em>path</em>
     * only, so a query string ({@code pic.jpg?w=100}) or a dot in the host name no
     * longer ends up in the file name. A URL without an extension is stored without one.
     *
     * @param url absolute image URL; {@code null} or blank is treated as "nothing to download"
     * @return the generated file name (without directory), or {@code ""} if nothing was
     *         downloaded or the download failed
     */
    public static String downImage(String url){
        if (url == null || url.trim().length() == 0) {
            return "";
        }
        InputStream in = null;
        OutputStream out = null;
        try {
            URL imageUrl = new URL(url);
            String picName = UUID.randomUUID().toString() + extensionOf(imageUrl.getPath());
            in = imageUrl.openStream();
            out = new FileOutputStream(new File(IMAGE_DIR + picName));
            byte[] buffer = new byte[1024];
            int length;
            // read() returns -1 at end of stream
            while ((length = in.read(buffer)) != -1) {
                out.write(buffer, 0, length);
            }
            return picName;
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        } finally {
            // closed independently so a failure on one stream cannot leak the other
            closeQuietly(in);
            closeQuietly(out);
        }
    }

    /**
     * Loads a properties file from the classpath.
     *
     * @param configFile resource name as understood by {@link Class#getResourceAsStream(String)}
     * @return the loaded properties; empty if the resource is missing or cannot be read
     */
    public static Properties loadConfig(String configFile) {
        Properties properties = new Properties();
        InputStream input = null;
        try {
            if (configFile != null) {
                input = Utils.class.getResourceAsStream(configFile);
            }
            if (input == null) {
                // resource not found on the classpath
                System.out.println("配置文件加载失败");
                return properties;
            }
            properties.load(input);
        } catch (Exception e) {
            System.out.println("配置文件加载失败");
            e.printStackTrace();
        } finally {
            closeQuietly(input);
        }
        return properties;
    }

    /**
     * Opens a JDBC connection using the MySQL driver.
     *
     * @param dataSourceModel connection URL, user name and password
     * @return an open connection, or {@code null} if the driver class is missing or the
     *         connection could not be established
     */
    public static Connection getConnection(DataSourceModel dataSourceModel){
        Connection conn=null;
        try {
            Class.forName("com.mysql.jdbc.Driver");
            conn= DriverManager.getConnection(dataSourceModel.getUrl(), dataSourceModel.getUsername(), dataSourceModel.getPassword());
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Executes an INSERT/UPDATE/DELETE statement given as raw SQL text.
     *
     * <p>NOTE(review): the SQL is executed verbatim, so callers must never build it from
     * untrusted text.
     *
     * @param connection open connection; {@code null} makes this a no-op
     * @param sql        complete SQL statement
     */
    public static void saveDb(Connection connection,String sql){
        if (connection == null) {
            return;
        }
        Statement statement=null;
        try {
            statement=connection.createStatement();
            statement.executeUpdate(sql);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(statement);
        }
    }

    /**
     * Runs a counting query and returns the value of its {@code totalnum} column.
     *
     * @param connection open connection; {@code null} yields {@code 0}
     * @param sql        query that exposes the count as a column named {@code totalnum}
     * @return the count from the last returned row, or {@code 0} if the query returned no
     *         rows or failed
     */
    public static int excuteCountQuery(Connection connection,String sql){
        int rowCount=0;
        if (connection == null) {
            return rowCount;
        }
        Statement statement=null;
        ResultSet resultSet=null;
        try {
            statement=connection.createStatement();
            resultSet=statement.executeQuery(sql);
            while(resultSet.next()){
                rowCount = resultSet.getInt("totalnum");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // closed independently so a failing ResultSet.close() cannot leak the statement
            closeQuietly(resultSet);
            closeQuietly(statement);
        }
        return rowCount;
    }

    /** Returns the extension (including the dot) of the last segment of {@code path}, or {@code ""}. */
    private static String extensionOf(String path) {
        if (path == null) {
            return "";
        }
        int dot = path.lastIndexOf('.');
        boolean dotInLastSegment = dot >= 0 && dot > path.lastIndexOf('/');
        if (!dotInLastSegment || dot == path.length() - 1) {
            return "";
        }
        return path.substring(dot);
    }

    /** Closes a stream if it is non-null, reporting (not propagating) any failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Closes a statement if it is non-null, reporting (not propagating) any failure. */
    private static void closeQuietly(Statement statement) {
        if (statement == null) {
            return;
        }
        try {
            statement.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes a result set if it is non-null, reporting (not propagating) any failure. */
    private static void closeQuietly(ResultSet resultSet) {
        if (resultSet == null) {
            return;
        }
        try {
            resultSet.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

}

然后就是MyProcessor,负责分析页面:

/**
 * WebMagic page processor for the movie site configured under the {@code dy} key of
 * {@code /config.properties}.
 *
 * <p>A page that contains cover links is treated as a listing page: every cover link and
 * the "next page" link are queued for crawling. Any other page is treated as a movie
 * detail page and parsed into a {@link Movie}, which is published under the result key
 * {@code "movie"} for the pipeline.
 */
public class MyProcessor implements PageProcessor {

    static Properties properties;
    static DataSourceModel dataSourceModel;
    static String table;

    /** Start URL, read from the {@code dy} property. */
    static String dy;

    static {
        properties = Utils.loadConfig("/config.properties");
        dataSourceModel = new DataSourceModel();
        dataSourceModel.setUrl(properties.getProperty("db_url"));
        dataSourceModel.setUsername(properties.getProperty("db_username"));
        dataSourceModel.setPassword(properties.getProperty("db_password"));

        table = properties.getProperty("db_table");

        dy = properties.getProperty("dy");
    }

    private Site site = Site.me()
            .setCharset("utf8")        // page encoding
            .setTimeOut(5000)          // download timeout, ms
            .setRetrySleepTime(5000)   // pause between retries, ms
            .setRetryTimes(3)          // retry count (was mistakenly written as setSleepTime(3))
            .setSleepTime(5000);       // pause between pages, ms; replaces the manual 5 s sleep in process()

    public MyProcessor() {
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Handles one downloaded page: queues links on a listing page, extracts a
     * {@link Movie} on a detail page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // cover links only exist on listing pages
        List<Selectable> covers = page.getHtml().css("article a.entry-thumb.entry-cover").nodes();
        if (covers == null || covers.isEmpty()) {
            processDetail(page);
            return;
        }

        // queue every detail page linked from this listing page
        for (Selectable cover : covers) {
            page.addTargetRequest(cover.links().toString());
        }

        // queue the next listing page; the last page has no such link
        List<Selectable> nextLinks = page.getHtml().css("nav.pagination a.next").nodes();
        if (nextLinks != null && !nextLinks.isEmpty()) {
            String nextUrl = nextLinks.get(0).links().toString();
            System.out.println("下一页:" + nextUrl);
            page.addTargetRequest(nextUrl);
        }
    }

    /** Parses a detail page and publishes the result under the key {@code "movie"}. */
    private static void processDetail(Page page) {
        try {
            Movie movie = ananyDetail(page);
            page.putField("movie", movie);
        } catch (InterruptedException e) {
            // keep the interrupt visible to the spider's worker thread
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts a {@link Movie} from a detail page.
     *
     * <p>The title is expected to look like {@code [name][2019][genre / country]}: the
     * first segment becomes the name, the remaining segments are joined into the label,
     * each prefixed with {@code /}. A title without brackets, or with a single segment,
     * is used as the name as-is.
     *
     * @param page the detail page
     * @return a movie whose unfilled attributes are empty strings
     * @throws IllegalStateException if the page has no title element
     * @throws Exception             if the thread is interrupted while pausing
     */
    public static Movie ananyDetail(Page page) throws Exception {
        Thread.sleep(1000);//1s
        // not every attribute is crawled; start from empty strings so that saving never sees null
        Movie movie = new Movie("", "", "", "", "", "", "", "", "", "");
        Html html = page.getHtml();

        String rawTitle = firstText(html, "header.entry-header h1");
        if (rawTitle == null) {
            throw new IllegalStateException("no title element (header.entry-header h1) found on page");
        }

        // e.g. [超时空心跳][2019][爱情 / 中国]
        String title = rawTitle.trim().replace("][", "/");
        if (title.startsWith("[")) {
            title = title.substring(1);
        }
        if (title.endsWith("]")) {
            title = title.substring(0, title.length() - 1);
        }
        String[] parts = title.split("/");
        if (parts.length > 0) {
            movie.setName(parts[0]);
        }
        StringBuilder label = new StringBuilder();
        for (int i = 1; i < parts.length; i++) {
            label.append("/").append(parts[i]);
        }
        movie.setLabel(label.toString());

        // demo-level guard: longer text might not fit the database column, so it is skipped
        String information = firstText(html, "div.entry-content p");
        if (information != null && information.length() < 250) {
            movie.setInformation(information);
        }

        // download the poster (if any) and remember the stored file name
        String image = html.css("div.entry-content p img", "src").toString();
        if (image != null && image.trim().length() > 0) {
            movie.setImage(Utils.downImage(image));
        }
        // the remaining attributes are not crawled
        return movie;
    }

    /** Returns the text of the first element matching {@code selector}, or {@code null} if none matches. */
    private static String firstText(Html html, String selector) {
        List<Selectable> nodes = html.css(selector, "text").nodes();
        if (nodes == null || nodes.isEmpty()) {
            return null;
        }
        return nodes.get(0).toString();
    }

    /** Starts the crawl at the configured start URL with three threads, saving results through {@link MyPipelines}. */
    public static void main(String[] args) {
        Spider spider1 = Spider.create(new MyProcessor());
        spider1.addUrl(dy)
                .addPipeline(new MyPipelines(dataSourceModel, table))
                .thread(3)
                .run();
    }

}

然后是MyPipelines,对爬取到的数据进行保存:

/**
 * WebMagic pipeline that stores the {@link Movie} published under the result key
 * {@code "movie"} in the configured database table.
 *
 * <p>A movie whose name is not yet in the table is inserted; otherwise the existing row
 * is updated, but only while its {@code manual} column is {@code '0'}.
 */
public class MyPipelines implements Pipeline {
    private DataSourceModel dataSourceModel;
    private String table;

    public MyPipelines() {
    }

    /**
     * @param dataSourceModel connection settings passed to {@code Utils.getConnection}
     * @param table           name of the target table; concatenated into the SQL text, so it
     *                        must come from trusted configuration
     */
    public MyPipelines(DataSourceModel dataSourceModel, String table) {
        this.dataSourceModel = dataSourceModel;
        this.table = table;
    }

    @Override
    public void process(ResultItems resultItems, Task task) {
        Movie movie=resultItems.get("movie");
        if(movie!=null) {
            System.out.println(movie.toString());
            saveDb(movie);
        }
    }

    /**
     * Inserts or updates one movie. Failures are reported on the console and never
     * propagated.
     *
     * <p>All values are bound through {@code PreparedStatement} parameters: the values are
     * scraped from web pages, so a quote character in a title must not be able to break or
     * alter the statement.
     *
     * @param movie the record to store
     */
    public  void saveDb(Movie movie){
        Connection connection=null;
        try {
            connection=Utils.getConnection(dataSourceModel);
            if (connection == null) {
                // getConnection already reported the cause
                System.out.println("入库失败");
                return;
            }
            String today = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
            if (countByName(connection, movie.getName()) <= 0) {
                insert(connection, movie, today);
            } else {
                update(connection, movie, today);
            }
        } catch (Exception e) {
            System.out.println("入库失败");
            e.printStackTrace();
        }finally {
            if(connection!=null){
                try {
                    connection.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Returns how many rows of the table carry the given name. */
    private int countByName(Connection connection, String name) throws SQLException {
        java.sql.PreparedStatement statement = connection.prepareStatement(
                "select count(1) as totalnum from " + table + " where name=?");
        try {
            statement.setString(1, name);
            java.sql.ResultSet rows = statement.executeQuery();
            try {
                return rows.next() ? rows.getInt("totalnum") : 0;
            } finally {
                rows.close();
            }
        } finally {
            statement.close();
        }
    }

    /** Inserts a new row, using {@code today} as the creation time. */
    private void insert(Connection connection, Movie movie, String today) throws SQLException {
        java.sql.PreparedStatement statement = connection.prepareStatement(
                "insert into " + table
                        + " (name,url,createtime,information,introduce,manual,image,type,label)"
                        + " values (?,?,?,?,?,?,?,?,?)");
        try {
            statement.setString(1, movie.getName());
            statement.setString(2, movie.getUrl());
            statement.setString(3, today);
            statement.setString(4, movie.getInformation());
            statement.setString(5, movie.getIntroduce());
            statement.setString(6, movie.getManual());
            statement.setString(7, movie.getImage());
            statement.setString(8, movie.getType());
            statement.setString(9, movie.getLabel());
            statement.executeUpdate();
        } finally {
            statement.close();
        }
    }

    /** Refreshes url and update time of the existing row, unless its manual column is not '0'. */
    private void update(Connection connection, Movie movie, String today) throws SQLException {
        java.sql.PreparedStatement statement = connection.prepareStatement(
                "update " + table + " set url=?,updatetime=? where name=? and manual='0'");
        try {
            statement.setString(1, movie.getUrl());
            statement.setString(2, today);
            statement.setString(3, movie.getName());
            statement.executeUpdate();
        } finally {
            statement.close();
        }
    }
}

这只是一个例子,代码的注释已经写得很细了;相关知识点可以先参考其他资料,等有时间我再回来补充整理。

分享到 :
0 人收藏
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

积分:7942463
帖子:1588486
精华:0
期权论坛 期权论坛
发布
内容

下载期权论坛手机APP