# -*- coding: utf-8 -*-
# coder Thending
# python3.6.4
#以下是需要用到的库
import requests
import json
import re
import pandas
import time
from bs4 import BeautifulSoup
from datetime import datetime
# Keep prompting until the user enters a non-negative whole number of pages.
# int() is used for validation instead of str.isdigit(), because isdigit()
# also accepts characters such as superscript digits that int() cannot parse.
while True:
    y = input('请问要获取几页新浪国内新闻(一页22个):')
    try:
        x = int(y)
    except ValueError:
        x = None  # not an integer at all
    if x is not None and x >= 0:
        print("下载中···")
        break
    print("输入错误,请重新输入")
# Comment-info endpoint (format=json). The `{}` placeholder is filled with the
# news id by getCommentsCounts.
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1'
# Paged news-list endpoint (show_num=22 items per page, wrapped in the
# `newsloadercallback` callback). The `{}` placeholder is filled with the
# page number.
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1521876473467'
def getCommentsCounts(newsurl):
    """Return the total comment count for a news article.

    The news id is taken from the ``doc-i<id>.shtml`` part of *newsurl* and
    substituted into the module-level ``commentURL`` template.

    Args:
        newsurl: Article URL containing ``doc-i<id>.shtml``.

    Returns:
        The value stored at ``['result']['count']['total']`` in the comment
        API response.

    Raises:
        ValueError: If *newsurl* contains no news id, or the response holds
            no JSON object (``json.JSONDecodeError`` is a ``ValueError`` too).
        KeyError: If the decoded response lacks the expected keys.

    Network errors raised by ``requests`` propagate unchanged.
    """
    # Raw string, escaped dot, non-greedy group: capture only the id even if
    # the URL carries a query string that happens to contain ".shtml" again.
    m = re.search(r'doc-i(.+?)\.shtml', newsurl)
    if m is None:
        raise ValueError('no news id found in URL: {!r}'.format(newsurl))
    newsid = m.group(1)

    # A timeout keeps one stalled connection from hanging the whole crawl.
    comments = requests.get(commentURL.format(newsid), timeout=10)

    # The payload may be bare JSON or wrapped as "var data={...}".
    # str.strip('var data=') removes a *set of characters*, not a prefix, so
    # slice out the outermost JSON object instead.
    body = comments.text
    start, end = body.find('{'), body.rfind('}')
    if start == -1 or end < start:
        raise ValueError('comment response contains no JSON object')
    jd = json.loads(body[start:end + 1])
    return jd['result']['count']['total']
def getNewsDetail(newsurl):
    """Download one news article page and extract its fields.

    Args:
        newsurl: URL of the article page.

    Returns:
        A dict with the keys '时间' (publish time as ``YYYY-MM-DD HH:MM``),
        '来源' (text of ``.source``), '内容' (article paragraphs joined by
        spaces, last two paragraphs dropped), '标题' (text of
        ``.main-title``), '作者' (text of ``.show_author`` without the
        "责任编辑" label), '评论数' (result of ``getCommentsCounts``) and
        '链接' (*newsurl* itself).

    Raises:
        IndexError: If one of the expected page elements is missing.
        ValueError: If the date text does not match ``%Y年%m月%d日 %H:%M``.

    Network errors raised by ``requests`` propagate unchanged.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(newsurl, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    # Named date_text rather than `time` so the imported time module is not
    # shadowed. Surrounding whitespace is trimmed before strict parsing.
    date_text = soup.select('.date')[0].text.strip()
    published = datetime.strptime(date_text, '%Y年%m月%d日 %H:%M')

    # str.strip('责任编辑:') removes any of those *characters* from both ends
    # and could eat part of the editor's name; drop the label as a prefix.
    author = soup.select('.show_author')[0].text.strip()
    label = '责任编辑'
    if author.startswith(label):
        # Accept both the ASCII and the full-width colon after the label.
        author = author[len(label):].lstrip(':：').strip()

    # Keys are inserted in the original order.
    result = {}
    result['时间'] = published.strftime("%Y-%m-%d %H:%M")
    result['来源'] = soup.select('.source')[0].text
    result['内容'] = ' '.join(
        [p.text.strip() for p in soup.select('.article p')[:-2]]
    )
    result['标题'] = soup.select('.main-title')[0].text
    result['作者'] = author
    result['评论数'] = getCommentsCounts(newsurl)
    result['链接'] = newsurl
    return result
def parseListLinks(url):
    """Fetch one page of the news-list API and parse every listed article.

    Args:
        url: Fully formatted list-API URL for a single page.

    Returns:
        A list with one ``getNewsDetail`` result per entry in
        ``['result']['data']`` of the response, in response order.

    Raises:
        ValueError: If the response contains no JSON object
            (``json.JSONDecodeError`` is a ``ValueError`` too).
        KeyError: If the decoded response lacks the expected keys.

    Errors raised by ``requests`` or ``getNewsDetail`` propagate unchanged.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, timeout=10)

    # The response is wrapped as "newsloadercallback({...});".
    # lstrip()/rstrip() with an argument strip *character sets*, not a
    # prefix/suffix, and fail on stray newlines, so slice out the outermost
    # JSON object instead.
    text = res.text
    start, end = text.find('{'), text.rfind('}')
    if start == -1 or end < start:
        raise ValueError('list response contains no JSON object')
    jd = json.loads(text[start:end + 1])

    return [getNewsDetail(ent['url']) for ent in jd['result']['data']]
# Crawl list pages 1..x and gather every parsed article into one list.
news_total = []
for page in range(1, x + 1):
    news_total += parseListLinks(url.format(page))

# Write all collected rows to a CSV file in the current directory.
# NOTE(review): utf_8_sig prepends a BOM, presumably so spreadsheet tools
# detect the encoding; confirm.
df = pandas.DataFrame(news_total)
df.to_csv("sina_news.csv", encoding="utf_8_sig")

# Report success only when at least one article was collected, then pause
# briefly so the message stays visible.
if news_total:
    print("已在当前目录下生成CSV文件!")
    time.sleep(3)