java抓取qq消息_Java代码---实现爬取腾讯新闻

论坛 期权论坛 编程之家     
选择匿名的用户   2021-5-30 14:33   86   0

环境准备:

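<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
    <version>1.1.21</version>
</dependency>

<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.5</version>
</dependency>

<dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-jdbc</artifactId>
    <version>5.2.2.RELEASE</version>
</dependency>

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>

<!-- 注意:下面的代码还用到了 Apache HttpClient(org.apache.httpcomponents:httpclient),同样需要加入依赖。 -->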

定义 POJO 类 Tencent,用于接收抓取到的新闻数据:

// Serialization version id; presumably the enclosing class implements Serializable
// (class header not shown here - TODO confirm).
private static final long serialVersionUID = 1L;

// Numeric identifier. The crawler code never sets it; presumably the
// database-generated primary key - TODO confirm against the table definition.
private int id;

// Headline of the news item (read from the feed item's "title" entry).
private String title;

// Short introduction/summary of the news item (feed entry "intro").
private String intro;

// Link to the news item (feed entry "url").
private String url;

// Publisher/origin of the news item (feed entry "source").
private String source;

// Publication time, parsed from the feed entry "publish_time"
// using the pattern yyyy-MM-dd HH:mm:ss.
private Date publishTime;

代码爬取数据:

// Shared JdbcTemplate: assigned in main() once the data source is configured,
// then used by addNews() to run the insert statement.
static JdbcTemplate jdbcTemplate = null;

/**
 * Crawls the Tencent news feed page by page and stores every item in the database.
 *
 * <p>Reads the JDBC settings from {@code src/main/resources/db.properties}, builds a
 * Druid-backed JdbcTemplate, then requests successive pages of the JSONP feed until a
 * page reports {@code datanum == 0} (or carries no data). Each item is copied into a
 * {@code Tencent} object and handed to {@code addNews}.
 *
 * @param args unused
 * @throws IOException if the properties file cannot be read, an HTTP request fails,
 *         or the server answers with a status other than 200
 * @throws ParseException if an item's {@code publish_time} does not match
 *         {@code yyyy-MM-dd HH:mm:ss}
 */
public static void main(String[] args) throws IOException, ParseException {
    // Load the external properties file; try-with-resources releases the file handle.
    Properties properties = new Properties();
    try (InputStream inputStream = new FileInputStream(new File("src/main/resources/db.properties"))) {
        properties.load(inputStream);
    }

    // Configure the connection pool from the loaded database properties.
    DruidDataSource dataSource = new DruidDataSource();
    dataSource.setDriverClassName(properties.getProperty("jdbc.driverClass"));
    dataSource.setUrl(properties.getProperty("jdbc.url"));
    dataSource.setUsername(properties.getProperty("jdbc.username"));
    dataSource.setPassword(properties.getProperty("jdbc.password"));
    jdbcTemplate = new JdbcTemplate(dataSource);

    // Loop-invariant helpers: build them once instead of once per page/item.
    Gson gson = new Gson();
    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    // One HTTP client serves all pages and is closed when crawling ends.
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        // Start from the first page.
        int page = 1;
        while (true) {
            // Alternative endpoint kept from the original code for reference:
            // https://pacaio.match.qq.com/irs/rcd?cid=89&token=4d4e2946f92c5708f32141479596d72e&id=&ext=bj&page=<page>&expIds=&callback=__jp0
            String urlTencent = "https://pacaio.match.qq.com/irs/rcd?cid=135&token=6e92c215fb08afa901ac31eca115a34f&ext=world&page=" + page + "&expIds=&callback=__jp4";

            HttpGet httpGet = new HttpGet(urlTencent);
            httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36");

            String html;
            try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
                int statusCode = httpResponse.getStatusLine().getStatusCode();
                if (statusCode != 200) {
                    // Silently skipping to the next page would spin forever on a
                    // persistently failing server, so stop with an explicit error.
                    throw new IOException("Unexpected HTTP status " + statusCode + " for page " + page);
                }
                // Decode as UTF-8 when the server declares no charset; the library
                // default (ISO-8859-1) would garble Chinese text.
                HttpEntity httpEntity = httpResponse.getEntity();
                html = EntityUtils.toString(httpEntity, java.nio.charset.StandardCharsets.UTF_8);
            }

            // Strip the JSONP callback wrapper and parse the JSON into a map.
            String json = parseJson(html);
            Map<?, ?> map = gson.fromJson(json, Map.class);
            if (map == null) {
                break;
            }

            // "datanum" is the number of items on this page; zero (or absent) ends the crawl.
            Object num = map.get("datanum");
            int number = (num == null) ? 0 : (int) Double.parseDouble(num.toString());
            if (number == 0) {
                break;
            }

            // Items of the current page. The cast is unchecked because Gson
            // deserializes into untyped collections.
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("data");
            if (list == null) {
                break;
            }

            // Copy every item into a Tencent object and persist it.
            for (Map<String, Object> map2 : list) {
                Tencent tencent = new Tencent();
                tencent.setTitle(map2.get("title").toString());
                tencent.setUrl(map2.get("url").toString());
                tencent.setIntro(map2.get("intro").toString());
                tencent.setSource(map2.get("source").toString());
                tencent.setPublishTime(simpleDateFormat.parse(map2.get("publish_time").toString()));
                addNews(tencent);
            }

            page++;
        }
    } finally {
        // Release the pooled database connections.
        dataSource.close();
    }
}

/**
 * Inserts one news item as a new row of the {@code t_tencent} table.
 *
 * @param tencent the item whose title, intro, url, source and publish time are stored
 */
public static void addNews(Tencent tencent) {
    final String insertSql = "insert into t_tencent (title,intro,url,source,publish_time) values (?,?,?,?,?)";

    // Bind values in the same order as the column list of the statement.
    Object[] rowValues = {
        tencent.getTitle(),
        tencent.getIntro(),
        tencent.getUrl(),
        tencent.getSource(),
        tencent.getPublishTime()
    };

    jdbcTemplate.update(insertSql, rowValues);
}

/**
 * Extracts the JSON payload from a JSONP response such as {@code __jp4({...})}.
 *
 * <p>Returns the text between the first {@code '('} and the last {@code ')'}. Input
 * that is already bare JSON (starts with <code>{</code> or {@code [}) or that has no
 * well-formed {@code (...)} wrapper is returned trimmed instead of raising
 * {@link StringIndexOutOfBoundsException}.
 *
 * @param data the raw response body; must not be {@code null}
 * @return the JSON text without the callback wrapper
 */
public static String parseJson(String data) {
    String trimmed = data.trim();

    // Bare JSON: parentheses inside string values must not be mistaken for a wrapper.
    if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
        return trimmed;
    }

    int start = data.indexOf('(');
    int end = data.lastIndexOf(')');

    // No wrapper, or a ')' that precedes the '(': nothing to strip.
    if (start < 0 || end <= start) {
        return trimmed;
    }

    return data.substring(start + 1, end);
}

结果如下:

bb473a95fb7d34295f99fd97fa493752.png

分享到 :
0 人收藏
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

积分:3875789
帖子:775174
精华:0
期权论坛 期权论坛
发布
内容

下载期权论坛手机APP