|
我们要爬取的是:http://www.coupling.pw/fl/dy
这里的知识点先留着,下次有时间再整理

这里为了方便,用的maven项目

pom.xml如下:
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.0.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-chrome-driver</artifactId>
<version>3.0.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-remote-driver</artifactId>
<version>3.0.1</version>
</dependency>
<dependency>
<groupId>com.codeborne</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-exec</artifactId>
<version>1.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.6</version>
</dependency>
首先是config.properties
#数据库
db_url=jdbc:mysql://localhost:3306/ziyuan?useUnicode=true&characterEncoding=utf-8
db_username=root
db_password=962464
#表名
db_table=movie
#链接
dy=http://www.coupling.pw/fl/dy
pojo:
电影movie:
/**
 * Mutable data holder describing one crawled movie.
 *
 * <p>All fields are plain strings. Each one has a getter and a setter, and
 * {@link #toString()} renders every field in declaration order.
 */
public class Movie {

    /** Movie name. */
    private String name;
    /** URL associated with the movie. */
    private String url;
    /** Creation time, kept as text. */
    private String createtime;
    /** Last update time, kept as text. */
    private String updatetime;
    /** Information text. */
    private String information;
    /** Introduction text. */
    private String introduce;
    /** Manual-update marker. */
    private String manual;
    /** Image. */
    private String image;
    /** Type. */
    private String type;
    /** Label(s). */
    private String label;

    /** Creates a movie whose fields are all {@code null}. */
    public Movie() {
    }

    /** Creates a movie with every field supplied by the caller. */
    public Movie(String name, String url, String createtime, String updatetime, String information,
                 String introduce, String manual, String image, String type, String label) {
        this.name = name;
        this.url = url;
        this.createtime = createtime;
        this.updatetime = updatetime;
        this.information = information;
        this.introduce = introduce;
        this.manual = manual;
        this.image = image;
        this.type = type;
        this.label = label;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getCreatetime() {
        return createtime;
    }

    public void setCreatetime(String createtime) {
        this.createtime = createtime;
    }

    public String getUpdatetime() {
        return updatetime;
    }

    public void setUpdatetime(String updatetime) {
        this.updatetime = updatetime;
    }

    public String getInformation() {
        return information;
    }

    public void setInformation(String information) {
        this.information = information;
    }

    public String getIntroduce() {
        return introduce;
    }

    public void setIntroduce(String introduce) {
        this.introduce = introduce;
    }

    public String getManual() {
        return manual;
    }

    public void setManual(String manual) {
        this.manual = manual;
    }

    public String getImage() {
        return image;
    }

    public void setImage(String image) {
        this.image = image;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getLabel() {
        return label;
    }

    public void setLabel(String label) {
        this.label = label;
    }

    /**
     * Renders all fields as {@code Movie{name='...', url='...', ...}}.
     * A {@code null} field is printed as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Movie{");
        text.append("name='").append(name).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", createtime='").append(createtime).append('\'');
        text.append(", updatetime='").append(updatetime).append('\'');
        text.append(", information='").append(information).append('\'');
        text.append(", introduce='").append(introduce).append('\'');
        text.append(", manual='").append(manual).append('\'');
        text.append(", image='").append(image).append('\'');
        text.append(", type='").append(type).append('\'');
        text.append(", label='").append(label).append('\'');
        text.append('}');
        return text.toString();
    }
}
数据库DataSourceModel:
/**
 * Holds the three values used to open a JDBC connection: the connection URL,
 * the user name and the password.
 */
public class DataSourceModel {

    /** JDBC connection URL. */
    private String url;
    /** Database user name. */
    private String username;
    /** Database password. */
    private String password;

    /** Package-private constructor; the fields are filled in afterwards through the setters. */
    DataSourceModel() {
    }

    /** @param url the JDBC connection URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @param username the database user name to store */
    public void setUsername(String username) {
        this.username = username;
    }

    /** @param password the database password to store */
    public void setPassword(String password) {
        this.password = password;
    }

    /** @return the stored JDBC connection URL, or {@code null} if none was set */
    public String getUrl() {
        return this.url;
    }

    /** @return the stored database user name, or {@code null} if none was set */
    public String getUsername() {
        return this.username;
    }

    /** @return the stored database password, or {@code null} if none was set */
    public String getPassword() {
        return this.password;
    }
}
然后是工具类:
/**
 * Static helpers used by the crawler: image download, configuration loading
 * and minimal JDBC access.
 *
 * <p>The helpers are best-effort: failures are reported on the console and
 * signalled through the return value ({@code ""}, an empty {@link Properties},
 * {@code null} or {@code 0}) rather than by throwing.
 */
public class Utils {

    /** Directory that {@link #downImage(String)} stores downloaded images in. */
    private static final String DEFAULT_IMAGE_DIR = "C:\\Users\\hasee\\Desktop\\http\\";

    /**
     * Downloads an image into the default image directory.
     *
     * @param url address of the image
     * @return the generated file name, or {@code ""} if the download failed
     */
    public static String downImage(String url) {
        return downImage(url, new File(DEFAULT_IMAGE_DIR));
    }

    /**
     * Downloads an image into {@code targetDir} under a random UUID-based name
     * that keeps the extension of the URL's last path segment.
     *
     * @param url       address of the image
     * @param targetDir directory to store the file in; created if it does not exist
     * @return the generated file name (without directory), or {@code ""} if
     *         {@code url} is null/blank, {@code targetDir} is null, or the download failed
     */
    public static String downImage(String url, File targetDir) {
        if (url == null || url.trim().isEmpty() || targetDir == null) {
            return "";
        }
        String picName = UUID.randomUUID().toString() + extensionOf(url);
        // Best effort: if the directory cannot be created, opening the file below fails
        // and is reported through the normal error path.
        if (!targetDir.isDirectory()) {
            targetDir.mkdirs();
        }
        File target = new File(targetDir, picName);
        // try-with-resources closes both streams even when one close() fails.
        try (InputStream in = new URL(url).openStream();
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[8192];
            int length;
            while ((length = in.read(buffer)) != -1) {
                out.write(buffer, 0, length);
            }
            return picName;
        } catch (Exception e) {
            e.printStackTrace();
            // Do not leave a truncated image behind; deletion itself is best effort.
            if (target.exists()) {
                target.delete();
            }
            return "";
        }
    }

    /**
     * Returns the extension (including the dot) of the last path segment of {@code url},
     * ignoring any query string or fragment, or {@code ""} when that segment has no dot.
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        return dot >= 0 ? lastSegment.substring(dot) : "";
    }

    /**
     * Loads a properties file from the classpath.
     *
     * @param configFile resource name as accepted by {@link Class#getResourceAsStream(String)}
     * @return the loaded properties; empty if the resource is missing or unreadable
     */
    public static Properties loadConfig(String configFile) {
        Properties properties = new Properties();
        if (configFile == null) {
            System.out.println("配置文件加载失败");
            return properties;
        }
        try (InputStream input = Utils.class.getResourceAsStream(configFile)) {
            if (input == null) {
                // Resource not found on the classpath.
                System.out.println("配置文件加载失败");
                return properties;
            }
            properties.load(input);
        } catch (Exception e) {
            System.out.println("配置文件加载失败");
        }
        return properties;
    }

    /**
     * Opens a JDBC connection using the MySQL driver.
     *
     * @param dataSourceModel URL, user name and password of the database
     * @return the open connection, or {@code null} if the driver class is missing
     *         or the connection attempt failed
     */
    public static Connection getConnection(DataSourceModel dataSourceModel) {
        Connection conn = null;
        try {
            Class.forName("com.mysql.jdbc.Driver");
            conn = DriverManager.getConnection(
                    dataSourceModel.getUrl(),
                    dataSourceModel.getUsername(),
                    dataSourceModel.getPassword());
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Executes an insert/update/delete statement. Does nothing when
     * {@code connection} is {@code null}; SQL errors are printed, not thrown.
     *
     * <p>The SQL text is executed as-is, so callers must not build it from
     * untrusted values.
     *
     * @param connection open connection (not closed by this method)
     * @param sql        statement to execute
     */
    public static void saveDb(Connection connection, String sql) {
        if (connection == null) {
            return;
        }
        try (Statement statement = connection.createStatement()) {
            statement.executeUpdate(sql);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs a count query and returns the value of its {@code totalnum} column
     * (used to decide whether a row already exists and must be updated).
     *
     * <p>The SQL text is executed as-is, so callers must not build it from
     * untrusted values.
     *
     * @param connection open connection (not closed by this method)
     * @param sql        query that exposes the count under the alias {@code totalnum}
     * @return the count, or {@code 0} when {@code connection} is {@code null},
     *         the query returns no row, or the query fails
     */
    public static int excuteCountQuery(Connection connection, String sql) {
        int rowCount = 0;
        if (connection == null) {
            return rowCount;
        }
        try (Statement statement = connection.createStatement();
             ResultSet resultSet = statement.executeQuery(sql)) {
            while (resultSet.next()) {
                rowCount = resultSet.getInt("totalnum");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return rowCount;
    }
}
然后就是MyProcessor,用来分析页面:
/**
 * WebMagic page processor for the movie site.
 *
 * <p>A page that contains article links is treated as a list page: every
 * article link and the "next page" link are queued for crawling. Any other
 * page is treated as a movie detail page and parsed into a {@link Movie},
 * which is handed to the pipeline under the result key {@code "movie"}.
 */
public class MyProcessor implements PageProcessor {
    static Properties properties;
    static DataSourceModel dataSourceModel;
    static String table;
    static String dy;

    // Read database settings, table name and start URL from the classpath config file.
    static {
        properties = Utils.loadConfig("/config.properties");
        dataSourceModel = new DataSourceModel();
        dataSourceModel.setUrl(properties.getProperty("db_url"));
        dataSourceModel.setUsername(properties.getProperty("db_username"));
        dataSourceModel.setPassword(properties.getProperty("db_password"));
        table = properties.getProperty("db_table");
        dy = properties.getProperty("dy");
    }

    private Site site = Site.me()
            .setCharset("utf8")        // page encoding
            .setTimeOut(5000)          // download timeout in ms
            .setRetrySleepTime(5000)   // pause between retries in ms
            // The original called setSleepTime(3) here although the intent was the
            // retry count, so the retry count was never set. The inter-request sleep
            // is now left at the WebMagic default.
            .setRetryTimes(3);         // number of retries

    public MyProcessor() {
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Handles one downloaded page: queues further links for list pages, or
     * extracts a {@link Movie} for detail pages.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller and stop working on this page.
            Thread.currentThread().interrupt();
            e.printStackTrace();
            return;
        }

        // Article links on the current page.
        List<Selectable> list = page.getHtml().css("article a.entry-thumb.entry-cover").nodes();

        // No article links: this is a detail page.
        if (list == null || list.isEmpty()) {
            try {
                Movie movie = ananyDetail(page);
                // Picked up by the pipeline for persistence.
                page.putField("movie", movie);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                e.printStackTrace();
            } catch (Exception e) {
                e.printStackTrace();
            }
            return;
        }

        // List page: queue every article link for crawling.
        for (Selectable selectable : list) {
            String url = selectable.links().toString();
            if (url != null) {
                page.addTargetRequest(url);
            }
        }

        // Queue the next list page, if there is one (the last page has no "next" link).
        List<Selectable> nextLinks = page.getHtml().css("nav.pagination a.next").nodes();
        if (nextLinks != null && !nextLinks.isEmpty()) {
            String nextUrl = nextLinks.get(0).links().toString();
            if (nextUrl != null) {
                System.out.println("下一页:" + nextUrl);
                page.addTargetRequest(nextUrl);
            }
        }
    }

    /**
     * Parses a movie detail page.
     *
     * <p>Fields that are not crawled stay as empty strings so that saving does
     * not fail on missing values.
     *
     * @param page the detail page
     * @return the extracted movie
     * @throws Exception if the page has no title element (it is then not a
     *                   detail page) or the thread is interrupted while waiting
     */
    public static Movie ananyDetail(Page page) throws Exception {
        Thread.sleep(1000);

        Movie movie = new Movie("", "", "", "", "", "", "", "", "", "");
        Html html = page.getHtml();

        // Record where the movie was found; the pipeline stores and updates this column.
        if (page.getUrl() != null) {
            String pageUrl = page.getUrl().toString();
            if (pageUrl != null) {
                movie.setUrl(pageUrl);
            }
        }

        String title = firstText(html, "header.entry-header h1");
        if (title == null) {
            throw new IllegalStateException("No title found; not a movie detail page");
        }
        applyTitle(movie, title);

        // The information text may exceed the database column length, so only
        // short values are kept (this is just a demo).
        String information = firstText(html, "div.entry-content p");
        if (information != null && information.length() < 250) {
            movie.setInformation(information);
        }

        // Download the cover image and remember the local file name.
        String image = html.css("div.entry-content p img", "src").toString();
        if (image != null && !image.trim().isEmpty()) {
            movie.setImage(Utils.downImage(image));
        }

        // Other details are intentionally not crawled.
        return movie;
    }

    /** Returns the text of the first element matching {@code selector}, or {@code null} if none matches. */
    private static String firstText(Html html, String selector) {
        List<Selectable> nodes = html.css(selector, "text").nodes();
        if (nodes == null || nodes.isEmpty()) {
            return null;
        }
        return nodes.get(0).toString();
    }

    /**
     * Splits a title such as {@code [name][2019][romance / China]} into the movie
     * name (first part) and the label (remaining parts, each prefixed with "/").
     * A title without the surrounding brackets is split on "/" as-is.
     */
    private static void applyTitle(Movie movie, String title) {
        String normalized = title.trim();
        if (normalized.length() >= 2 && normalized.startsWith("[") && normalized.endsWith("]")) {
            // Drop the outermost brackets only when they are really there.
            normalized = normalized.substring(1, normalized.length() - 1);
        }
        String[] parts = normalized.replace("][", "/").split("/");

        StringBuilder label = new StringBuilder();
        for (int i = 0; i < parts.length; i++) {
            if (i == 0) {
                movie.setName(parts[i]);
            } else {
                label.append("/").append(parts[i]);
            }
        }
        movie.setLabel(label.toString());
    }

    /** Starts the crawl at the configured start URL with three worker threads. */
    public static void main(String[] args) {
        if (dy == null || dy.trim().isEmpty()) {
            System.out.println("Start URL 'dy' is missing from /config.properties; nothing to crawl.");
            return;
        }
        Spider spider1 = Spider.create(new MyProcessor());
        spider1.addUrl(dy)
                .addPipeline(new MyPipelines(dataSourceModel, table)) // stores results in the database
                .thread(3)
                .run();
    }
}
然后是MyPipelines对爬取到的数据进行保存
/**
 * WebMagic pipeline that stores every crawled {@link Movie} in the database.
 *
 * <p>A movie whose name is not yet present in the table is inserted;
 * otherwise the URL and update time of the existing row are refreshed,
 * but only for rows whose {@code manual} column is {@code '0'}.
 */
public class MyPipelines implements Pipeline {
    private DataSourceModel dataSourceModel;
    private String table;

    public MyPipelines() {
    }

    /**
     * @param dataSourceModel connection settings of the target database
     * @param table           name of the table to write to; it is concatenated into
     *                        the SQL text and must therefore come from trusted configuration
     */
    public MyPipelines(DataSourceModel dataSourceModel, String table) {
        this.dataSourceModel = dataSourceModel;
        this.table = table;
    }

    /**
     * Prints and stores the movie found under the result key {@code "movie"};
     * results without a movie are ignored.
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Movie movie = resultItems.get("movie");
        if (movie != null) {
            System.out.println(movie.toString());
            saveDb(movie);
        }
    }

    /**
     * Inserts the movie or updates the existing row with the same name.
     * Failures are reported on the console and never thrown.
     *
     * <p>All values are bound as statement parameters, so quotes or placeholder-like
     * text in crawled content cannot alter the SQL.
     *
     * @param movie the movie to store
     */
    public void saveDb(Movie movie) {
        // A null connection (failed connect) is legal in try-with-resources and is simply not closed.
        try (Connection connection = Utils.getConnection(dataSourceModel)) {
            if (connection == null) {
                System.out.println("入库失败");
                return;
            }
            String today = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
            if (countByName(connection, movie.getName()) <= 0) {
                insert(connection, movie, today);
            } else {
                update(connection, movie, today);
            }
        } catch (Exception e) {
            System.out.println("入库失败");
            e.printStackTrace();
        }
    }

    /** Returns how many rows already carry the given movie name. */
    private int countByName(Connection connection, String name) throws SQLException {
        String sql = "select count(1) as totalnum from " + table + " where name=?";
        try (java.sql.PreparedStatement statement = connection.prepareStatement(sql)) {
            statement.setString(1, name);
            try (java.sql.ResultSet rows = statement.executeQuery()) {
                return rows.next() ? rows.getInt("totalnum") : 0;
            }
        }
    }

    /** Inserts a new row for the movie, using {@code today} as its creation time. */
    private void insert(Connection connection, Movie movie, String today) throws SQLException {
        String sql = "insert into " + table
                + " (name,url,createtime,information,introduce,manual,image,type,label)"
                + " values (?,?,?,?,?,?,?,?,?)";
        try (java.sql.PreparedStatement statement = connection.prepareStatement(sql)) {
            statement.setString(1, movie.getName());
            statement.setString(2, movie.getUrl());
            statement.setString(3, today);
            statement.setString(4, movie.getInformation());
            statement.setString(5, movie.getIntroduce());
            statement.setString(6, movie.getManual());
            statement.setString(7, movie.getImage());
            statement.setString(8, movie.getType());
            statement.setString(9, movie.getLabel());
            statement.executeUpdate();
        }
    }

    /** Refreshes URL and update time of the row with the movie's name, unless it was edited manually. */
    private void update(Connection connection, Movie movie, String today) throws SQLException {
        String sql = "update " + table + " set url=?,updatetime=? where name=? and manual='0'";
        try (java.sql.PreparedStatement statement = connection.prepareStatement(sql)) {
            statement.setString(1, movie.getUrl());
            statement.setString(2, today);
            statement.setString(3, movie.getName());
            statement.executeUpdate();
        }
    }
}
这只是一个例子,代码已经注释得很细了,至于知识点可以先参考其他资料,等有时间了我再回来整理。 |